Пример #1
0
def process( in_biom, out_biom, out_metadata ):
    """
    @summary: Extracts the 'blast_affiliations' metadata of each observation in a TSV file and writes a BIOM without this metadata. In the written BIOM the list/tuple metadata are joined in strings and a 'taxonomy' metadata is set from 'blast_taxonomy'.
    @param in_biom: [str] Path to the BIOM file to process.
    @param out_biom: [str] Path to the written BIOM file.
    @param out_metadata: [str] Path to the written TSV file (one line by blast affiliation).
    """
    ordered_blast_keys = ["taxonomy", "subject", "evalue", "perc_identity", "perc_query_coverage", "aln_length"] # Keys in blast_affiliations metadata
    taxonomy_depth = 0
    unclassified_observations = list()

    # The context manager guarantees that the TSV is flushed and closed when the function returns or fails
    with open( out_metadata, "w" ) as FH_metadata:
        FH_metadata.write( "#OTUID\t" + "\t".join(ordered_blast_keys) + "\n" )
        biom = BiomIO.from_json( in_biom )
        for observation in biom.get_observations():
            metadata = observation["metadata"]
            # Iterate on a copy of the keys because 'blast_affiliations' is deleted during the loop
            for metadata_key in list(metadata.keys()):
                value = metadata[metadata_key]
                if metadata_key == "blast_affiliations": # Extract blast_affiliations metadata in metadata_file
                    if value is not None:
                        for current_affi in value:
                            if isinstance(current_affi["taxonomy"], (list, tuple)):
                                current_affi["taxonomy"] = ";".join( current_affi["taxonomy"] )
                            FH_metadata.write( observation["id"] + "\t" + "\t".join([str(current_affi[item]) for item in ordered_blast_keys]) + "\n" )
                    del metadata[metadata_key]
                elif isinstance(value, (list, tuple)): # All lists are transformed in strings (None is left untouched)
                    metadata[metadata_key] = ";".join( map(str, value) )
            if "blast_taxonomy" in metadata:
                if metadata["blast_taxonomy"] is None:
                    unclassified_observations.append( observation["id"] )
                    metadata["taxonomy"] = list()
                else:
                    ranks = metadata["blast_taxonomy"].split(";")
                    taxonomy_depth = len(ranks) # The depth of the last classified observation is used for the unclassified ones
                    metadata["taxonomy"] = ranks
    # Add "Unclassified" ranks in unclassified observations
    if taxonomy_depth > 0:
        for observation_id in unclassified_observations:
            observation_metadata = biom.get_observation_metadata(observation_id)
            observation_metadata["taxonomy"] = ["Unclassified"] * taxonomy_depth
    BiomIO.write( out_biom, biom )
Пример #2
0
def aff_to_metadata(reference_file,
                    biom_in,
                    biom_out,
                    blast_files=None,
                    rdp_files=None):
    """
    @summary: Adds the taxonomy metadata coming from blast and/or RDPClassifier results on the observations of a BIOM file.
    @param reference_file: [str] The path to the reference file.
    @param biom_in: [str] The path to the Biom file to process.
    @param biom_out: [str] The path to the biom output file.
    @param blast_files: [list] The paths to the blast results in tabular format (outfmt 6 with NCBI Blast+). None disables the blast metadata.
    @param rdp_files: [list] The paths to the RDPClassifier results. None disables the RDP metadata.
    """
    use_blast = blast_files is not None
    use_rdp = rdp_files is not None

    # Taxonomy for each reference (key=gene_id ; value=gene_taxonomy), only needed to annotate the blast hits
    taxonomy_by_reference = get_tax_from_fasta(reference_file)
    blast_by_cluster = dict()
    if use_blast:
        blast_by_cluster = get_bests_blast_affi(blast_files,
                                                taxonomy_by_reference)
    del taxonomy_by_reference

    rdp_by_cluster = dict()
    if use_rdp:
        rdp_by_cluster = get_rdp_affi(rdp_files)

    # Annotate each observation
    biom = BiomIO.from_json(biom_in)
    for cluster in biom.get_observations():
        cluster_id = cluster["id"]

        if use_blast:
            consensus = list()
            affiliations = list()
            if cluster_id in blast_by_cluster:  # Current observation has a match
                alignments = blast_by_cluster[cluster_id]['alignments']
                consensus = get_tax_consensus(
                    [tax.split(';') for tax in alignments])
                for tax in alignments:
                    affiliations.extend(alignments[tax])
            biom.add_metadata(cluster_id, "blast_affiliations",
                              affiliations, "observation")
            biom.add_metadata(cluster_id, "blast_taxonomy", consensus,
                              "observation")

        if use_rdp:
            taxonomy = list()
            bootstrap = list()
            if cluster_id in rdp_by_cluster:
                rdp_hit = rdp_by_cluster[cluster_id]
                taxonomy = rdp_hit['taxonomy']
                bootstrap = rdp_hit['bootstrap']
            biom.add_metadata(cluster_id, "rdp_taxonomy", taxonomy,
                              "observation")
            biom.add_metadata(cluster_id, "rdp_bootstrap", bootstrap,
                              "observation")

    BiomIO.write(biom_out, biom)
Пример #3
0
def sampling_by_sample( input_biom, output_biom, nb_sampled=None, sampled_ratio=None ):
    """
    @summary: Writes a BIOM after a random sampling in each sample.
    @param input_biom: [str] Path to the processed BIOM.
    @param output_biom: [str] Path to outputed BIOM.
    @param nb_sampled: [int] Number of sampled sequences by sample.
    @param sampled_ratio: [float] Ratio of sampled sequences by sample. The number of sampled sequences is int(sample count * sampled_ratio).
    @note: nb_sampled and sampled_ratio are mutually exclusive.
    """
    initial_biom = BiomIO.from_json( input_biom )
    new_biom = Biom(
                    matrix_type="sparse",
                    generated_by="Sampling " + (str(nb_sampled) if nb_sampled is not None else str(sampled_ratio) + "%" ) + " elements by sample from " + input_biom
    )
    observations_already_added = set()
    for sample_name in initial_biom.get_samples_names():
        new_biom.add_sample( sample_name, initial_biom.get_sample_metadata(sample_name) )
        sample_seq = initial_biom.get_sample_count(sample_name)
        sample_nb_sampled = nb_sampled
        if nb_sampled is None:
            sample_nb_sampled = int(sample_seq * sampled_ratio)
        # Compare with the effective number of draws: nb_sampled is None when a ratio is used
        if sample_seq < sample_nb_sampled:
            raise_exception( Exception( "\n\n#ERROR : " + str(sample_nb_sampled) + " sequences cannot be sampled in sample '" + str(sample_name) + "'. It only contains " + str(sample_seq) + " sequences.\n\n" ))
        else:
            for _ in range(sample_nb_sampled):
                # Take an observation in initial BIOM (the count is subtracted so that the same sequence is not drawn twice)
                selected_observation = initial_biom.random_obs_by_sample(sample_name)
                selected_observation_id = selected_observation['id']
                initial_biom.subtract_count( selected_observation_id, sample_name, 1 )
                # Put in new BIOM
                if selected_observation_id not in observations_already_added:
                    new_biom.add_observation( selected_observation_id, initial_biom.get_observation_metadata(selected_observation_id) )
                    observations_already_added.add( selected_observation_id )
                new_biom.add_count( selected_observation_id, sample_name, 1 )
    BiomIO.write( output_biom, new_biom )
Пример #4
0
def process( in_biom, out_biom, out_metadata ):
    """
    @summary: Extracts the 'blast_affiliations' metadata of each observation in a TSV file and writes a BIOM without this metadata. In the written BIOM the list/tuple metadata are joined in strings and a 'taxonomy' metadata is set from 'blast_taxonomy'.
    @param in_biom: [str] Path to the BIOM file to process.
    @param out_biom: [str] Path to the written BIOM file.
    @param out_metadata: [str] Path to the written TSV file (one line by blast affiliation).
    """
    ordered_blast_keys = ["taxonomy", "subject", "evalue", "perc_identity", "perc_query_coverage", "aln_length"] # Keys in blast_affiliations metadata
    taxonomy_depth = 0
    unclassified_observations = list()

    # The context manager guarantees that the TSV is flushed and closed when the function returns or fails
    with open( out_metadata, "w" ) as FH_metadata:
        FH_metadata.write( "#OTUID\t" + "\t".join(ordered_blast_keys) + "\n" )
        biom = BiomIO.from_json( in_biom )
        for observation in biom.get_observations():
            metadata = observation["metadata"]
            # Iterate on a copy of the keys because 'blast_affiliations' is deleted during the loop
            for metadata_key in list(metadata.keys()):
                value = metadata[metadata_key]
                if metadata_key == "blast_affiliations": # Extract blast_affiliations metadata in metadata_file
                    if value is not None:
                        for current_affi in value:
                            if isinstance(current_affi["taxonomy"], (list, tuple)):
                                current_affi["taxonomy"] = ";".join( current_affi["taxonomy"] )
                            FH_metadata.write( observation["id"] + "\t" + "\t".join([str(current_affi[item]) for item in ordered_blast_keys]) + "\n" )
                    del metadata[metadata_key]
                elif isinstance(value, (list, tuple)): # All lists are transformed in strings (None is left untouched)
                    metadata[metadata_key] = ";".join( map(str, value) )
            if "blast_taxonomy" in metadata:
                if metadata["blast_taxonomy"] is None:
                    unclassified_observations.append( observation["id"] )
                    metadata["taxonomy"] = list()
                else:
                    ranks = metadata["blast_taxonomy"].split(";")
                    taxonomy_depth = len(ranks) # The depth of the last classified observation is used for the unclassified ones
                    metadata["taxonomy"] = ranks
    # Add "Unclassified" ranks in unclassified observations
    if taxonomy_depth > 0:
        for observation_id in unclassified_observations:
            observation_metadata = biom.get_observation_metadata(observation_id)
            observation_metadata["taxonomy"] = ["Unclassified"] * taxonomy_depth
    BiomIO.write( out_biom, biom )
Пример #5
0
def sampling_by_sample( input_biom, output_biom, nb_sampled=None, sampled_ratio=None ):
    """
    @summary: Writes a BIOM after a random sampling in each sample.
    @param input_biom: [str] Path to the processed BIOM.
    @param output_biom: [str] Path to outputed BIOM.
    @param nb_sampled: [int] Number of sampled sequences by sample.
    @param sampled_ratio: [float] Ratio of sampled sequences by sample. The number of sampled sequences is int(sample count * sampled_ratio).
    @note: nb_sampled and sampled_ratio are mutually exclusive.
    @raise Exception: When a sample contains fewer sequences than the number to sample.
    """
    initial_biom = BiomIO.from_json( input_biom )
    new_biom = Biom(
                    matrix_type="sparse",
                    generated_by="Sampling " + (str(nb_sampled) if nb_sampled is not None else str(sampled_ratio) + "%" ) + " elements by sample from " + input_biom
    )
    observations_already_added = set()
    for sample_name in initial_biom.get_samples_names():
        new_biom.add_sample( sample_name, initial_biom.get_sample_metadata(sample_name) )
        sample_seq = initial_biom.get_sample_count(sample_name)
        sample_nb_sampled = nb_sampled
        if nb_sampled is None:
            sample_nb_sampled = int(sample_seq * sampled_ratio)
        # Compare with the effective number of draws: nb_sampled is None when a ratio is used
        if sample_seq < sample_nb_sampled:
            raise Exception( str(sample_nb_sampled) + " sequences cannot be sampled in sample '" + str(sample_name) + "'. It only contains " + str(sample_seq) + " sequences." )
        for _ in range(sample_nb_sampled):
            # Take an observation in initial BIOM (the count is subtracted so that the same sequence is not drawn twice)
            selected_observation = initial_biom.random_obs_by_sample(sample_name)
            selected_observation_id = selected_observation['id']
            initial_biom.subtract_count( selected_observation_id, sample_name, 1 )
            # Put in new BIOM
            if selected_observation_id not in observations_already_added:
                new_biom.add_observation( selected_observation_id, initial_biom.get_observation_metadata(selected_observation_id) )
                observations_already_added.add( selected_observation_id )
            new_biom.add_count( selected_observation_id, sample_name, 1 )
    BiomIO.write( output_biom, new_biom )
Пример #6
0
def mask_observation(rdp_clusters_discards, blast_clusters_discards,
                     input_biom, output_biom):
    """
    @summary : Masks the rdp affiliations and/or the blast affiliations of the specified clusters.
    @param rdp_clusters_discards : [list] Clusters with rdp affiliations to mask. None to mask nothing.
    @param blast_clusters_discards : [list] Clusters with blast consensus affiliations to mask. None to mask nothing.
    @param input_biom : [str] Path to input biom file
    @param output_biom : [str] Path to output biom file with affiliations masked
    """
    biom = BiomIO.from_json(input_biom)
    for observation in biom.get_observations():
        metadata = observation['metadata']

        # remove rdp taxonomic metadata (the emptied value keeps the type of the stored one)
        if rdp_clusters_discards is not None and observation['id'] in rdp_clusters_discards:
            if isinstance(metadata["rdp_taxonomy"], str):
                metadata["rdp_taxonomy"] = ""
                metadata["rdp_bootstrap"] = ""
            elif isinstance(metadata["rdp_taxonomy"], (list, tuple)):
                metadata["rdp_taxonomy"] = list()
                metadata["rdp_bootstrap"] = list()

        # remove blast metadata
        if blast_clusters_discards is not None and observation['id'] in blast_clusters_discards:
            metadata["blast_affiliations"] = list()
            metadata["blast_taxonomy"] = list()

    BiomIO.write(output_biom, biom)
Пример #7
0
def to_biom( clusters_file, count_file, output_biom, size_separator ):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file. Each line lists the
                            sequences of one cluster, the first one is the seed.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    """
    biom = Biom( generated_by='swarm', matrix_type="sparse" )

    # Preclusters count by sample
    preclusters_count = dict()
    with open( count_file ) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            precluster_id, count_str = line.strip().split(None, 1)
            preclusters_count[precluster_id] = count_str # For large dataset store count into a string consumes minus RAM than a sparse count

    # Add samples
    for sample_name in samples:
        biom.add_sample( sample_name )

    # Process count (the file is closed by the context manager, even on error)
    with open( clusters_file ) as clusters_fh:
        for cluster_idx, line in enumerate(clusters_fh, 1):
            line_fields = line.strip().split()
            seed_id = line_fields[0]
            if "FROGS_combined" in seed_id:
                cluster_name = "Cluster_" + str(cluster_idx) + "_FROGS_combined"
                comment = "WARNING"
            else:
                cluster_name = "Cluster_" + str(cluster_idx)
                comment = "na"
            cluster_count = {key:0 for key in samples}
            # Retrieve count by sample
            for seq_id in line_fields:
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                sample_counts = preclusters_count[real_seq_id].split()
                for sample_idx, sample_name in enumerate(samples):
                    cluster_count[sample_name] += int(sample_counts[sample_idx])
                preclusters_count[real_seq_id] = None # The count is no longer needed: release the string
            # Add cluster on biom
            biom.add_observation( cluster_name, {'comment': comment, 'seed_id':seed_id.rsplit(size_separator, 1)[0]} )
            observation_idx = biom.find_idx("observation", cluster_name)
            for sample_idx, sample_name in enumerate(samples):
                if cluster_count[sample_name] > 0:
                    biom.data.change( observation_idx, sample_idx, cluster_count[sample_name] )

    # Write
    BiomIO.write( output_biom, biom )
Пример #8
0
def to_biom(clusters_file, count_file, output_biom, size_separator):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file. Each line lists the
                            sequences of one cluster, the first one is the seed.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    """
    biom = Biom(generated_by='swarm', matrix_type="sparse")

    # Preclusters count by sample (only the non-null counts are stored)
    preclusters_count = dict()
    with open(count_file) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            line_fields = line.strip().split()
            count_by_sample = {}
            for idx, val in enumerate(line_fields[1:]):
                # The field is text: it must be converted before the comparison
                count = int(val)
                if count > 0:
                    count_by_sample[samples[idx]] = count
            preclusters_count[line_fields[0]] = count_by_sample

    # Add samples
    for sample_name in samples:
        biom.add_sample(sample_name)

    # Process count (the file is closed by the context manager, even on error)
    with open(clusters_file) as clusters_fh:
        for cluster_idx, line in enumerate(clusters_fh, 1):
            cluster_name = "Cluster_" + str(cluster_idx)
            cluster_count = {key: 0 for key in samples}
            line_fields = line.strip().split()
            # Retrieve count by sample
            for seq_id in line_fields:
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                for preclust_sample, preclust_count in preclusters_count[real_seq_id].items():
                    cluster_count[preclust_sample] += preclust_count
                preclusters_count[real_seq_id] = None  # The count is no longer needed: release it
            # Add cluster on biom
            biom.add_observation(
                cluster_name,
                {'seed_id': line_fields[0].rsplit(size_separator, 1)[0]})
            for sample_name in samples:
                if cluster_count[sample_name] > 0:
                    biom.add_count(cluster_name, sample_name,
                                   cluster_count[sample_name])

    # Write
    BiomIO.write(output_biom, biom)
Пример #9
0
def remove_observations(removed_observations, input_biom, output_biom):
    """
    @summary: Writes a copy of the input BIOM without the specified observations.
    @param removed_observations: [list] The names of the observations to remove.
    @param input_biom: [str] The path to the input BIOM.
    @param output_biom: [str] The path to the output BIOM.
    """
    cleaned_biom = BiomIO.from_json(input_biom)
    cleaned_biom.remove_observations(removed_observations)
    BiomIO.write(output_biom, cleaned_biom)
Пример #10
0
def filter_biom( removed_observations, in_biom, out_biom ):
    """
    @summary: Writes a copy of the processed BIOM without the specified observations.
    @param removed_observations: [dict] Each key is an observation name.
    @param in_biom: [str]: Path to the processed BIOM file.
    @param out_biom: [str]: Path to the cleaned BIOM file.
    """
    cleaned_biom = BiomIO.from_json(in_biom)
    cleaned_biom.remove_observations(removed_observations)
    BiomIO.write(out_biom, cleaned_biom)
Пример #11
0
def remove_observations( removed_observations, input_biom, output_biom ):
    """
    @summary: Writes a copy of the input BIOM without the specified observations.
    @param removed_observations: [list] The names of the observations to remove.
    @param input_biom: [str] The path to the input BIOM.
    @param output_biom: [str] The path to the output BIOM.
    """
    cleaned_biom = BiomIO.from_json( input_biom )
    cleaned_biom.remove_observations( removed_observations )
    BiomIO.write( output_biom, cleaned_biom )
Пример #12
0
def filter_biom( removed_observations, in_biom, out_biom ):
    """
    @summary: Writes a copy of the processed BIOM without the specified observations.
    @param removed_observations: [dict] Each key is an observation name.
    @param in_biom: [str]: Path to the processed BIOM file.
    @param out_biom: [str]: Path to the cleaned BIOM file.
    """
    cleaned_biom = BiomIO.from_json(in_biom)
    cleaned_biom.remove_observations(removed_observations)
    BiomIO.write(out_biom, cleaned_biom)
Пример #13
0
def to_biom( clusters_file, count_file, output_biom, size_separator ):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file. Each line lists the
                            sequences of one cluster, the first one is the seed.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    """
    biom = Biom( generated_by='swarm', matrix_type="sparse" )

    # Preclusters count by sample
    preclusters_count = dict()
    with open( count_file ) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            precluster_id, count_str = line.strip().split(None, 1)
            preclusters_count[precluster_id] = count_str # For large dataset store count into a string consumes minus RAM than a sparse count

    # Add samples
    for sample_name in samples:
        biom.add_sample( sample_name )

    # Process count (the file is closed by the context manager, even on error)
    with open( clusters_file ) as clusters_fh:
        for cluster_idx, line in enumerate(clusters_fh, 1):
            cluster_name = "Cluster_" + str(cluster_idx)
            cluster_count = {key:0 for key in samples}
            line_fields = line.strip().split()
            # Retrieve count by sample
            for seq_id in line_fields:
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                sample_counts = preclusters_count[real_seq_id].split()
                for sample_idx, sample_name in enumerate(samples):
                    cluster_count[sample_name] += int(sample_counts[sample_idx])
                preclusters_count[real_seq_id] = None # The count is no longer needed: release the string
            # Add cluster on biom
            biom.add_observation( cluster_name, {'seed_id':line_fields[0].rsplit(size_separator, 1)[0]} )
            observation_idx = biom.find_idx("observation", cluster_name)
            for sample_idx, sample_name in enumerate(samples):
                if cluster_count[sample_name] > 0:
                    biom.data.change( observation_idx, sample_idx, cluster_count[sample_name] )

    # Write
    BiomIO.write( output_biom, biom )
Пример #14
0
def get_checked( abund_file, checked_sample, taxonomy_key, expected_by_depth ):
    """
    @summary: Returns the number of sequences of one sample by taxon, for each taxonomic depth.
    @param abund_file: [str] Path to the BIOM file.
    @param checked_sample: [str] The name of the sample whose counts are used.
    @param taxonomy_key: [str] The metadata key for the taxonomy. Observations where this metadata is None are counted as "unknown_taxa" at every depth.
    @param expected_by_depth: [dict/list] Container indexed by depth (0-based). Its length gives the number of ranks; the element at the last depth is used to test whether a blast affiliation taxonomy (ranks joined by ';') is expected.
    @return: [dict] By depth, the count by taxon (ranks joined by ';' from the first rank to the depth).
    @note: When the last rank is "Multi-affiliation" and exactly one distinct blast affiliation is expected, this affiliation replaces the taxonomy. Otherwise a warning is emitted and the taxonomy is kept.
    """
    checked_by_depth = dict()
    biom = BiomIO.from_json(abund_file)
    for current_obs in biom.get_observations():
        obs_metadata = current_obs["metadata"]
        if obs_metadata[taxonomy_key] is not None:
            clean_taxonomy = getCleanedTaxonomy(obs_metadata[taxonomy_key])
        else:
            clean_taxonomy = ["unknown_taxa"]*len(expected_by_depth)
        count = biom.get_count(current_obs["id"], checked_sample)
        if not count > 0: # The observation is absent from the checked sample
            continue
        if clean_taxonomy[-1] == "Multi-affiliation":
            expected_matches = list()
            taxonomies = list()
            expected_taxonomies = expected_by_depth[len(clean_taxonomy)-1]
            for affiliation in obs_metadata["blast_affiliations"]:
                affi_taxonomy = ";".join(getCleanedTaxonomy(affiliation["taxonomy"]))
                if affi_taxonomy in taxonomies: # Each distinct taxonomy is evaluated only once
                    continue
                taxonomies.append(affi_taxonomy)
                if affi_taxonomy in expected_taxonomies:
                    expected_matches.append(getCleanedTaxonomy(affiliation["taxonomy"]))
            if len(expected_matches) == 1:
                clean_taxonomy = expected_matches[0]
            else:
                warnings.warn( "Multi-affiliation cannot be resolved for " + str((float(count)*100)/biom.get_total_count()) + "% sequences. Possible taxonomies: '" + "', '".join(taxonomies) + "'." )
        # Add the count to the taxon of each depth
        for rank_depth in range(len(clean_taxonomy)):
            rank_taxonomy = ";".join(clean_taxonomy[:rank_depth + 1])
            count_by_taxon = checked_by_depth.setdefault(rank_depth, dict())
            count_by_taxon[rank_taxonomy] = count_by_taxon.get(rank_taxonomy, 0) + count
    return checked_by_depth
Пример #15
0
def get_retrieved_by_sample( biom_file, reference_by_obs_id, references_by_sample, uniq_id, uniq_id_by_sample ):
    """
    @summary: Returns by sample the number of detected observations, of retrieved references and of retrieved references expected in the sample.
    @param biom_file: [str] Path to the BIOM file.
    @param reference_by_obs_id: [dict] By observation ID the reference ID. A value containing "," is considered as a chimera and is only counted in "detected".
    @param references_by_sample: [dict] By sample name the container of the expected reference IDs.
    @param uniq_id: [dict] By reference ID the value used to count the distinct retrieved references.
    @param uniq_id_by_sample: [dict] By sample name and by reference ID the value used to count the distinct expected retrieved references.
    @return: [dict] By sample name a dictionary with the keys "detected", "retrieved" and "expected_retrieved".
    """
    counts_by_sample = dict()
    biom = BiomIO.from_json( biom_file )
    for sample_name in biom.get_samples_names():
        nb_detected = 0
        found_refs = dict()
        found_expected_refs = dict()
        for obs in biom.get_observations_by_sample( sample_name ):
            nb_detected += 1
            ref_id = reference_by_obs_id[obs['id']]
            if "," in ref_id: # Is a chimera
                continue
            found_refs[ref_id] = 1
            if ref_id in references_by_sample[sample_name]:
                found_expected_refs[ref_id] = 1
        # Several references can share the same uniq ID: count the distinct ones
        nb_retrieved = len(set( uniq_id[ref_id] for ref_id in found_refs ))
        nb_expected_retrieved = len(set( uniq_id_by_sample[sample_name][ref_id] for ref_id in found_expected_refs ))
        # Results
        counts_by_sample[sample_name] = {
            "detected": nb_detected,
            "retrieved": nb_retrieved,
            "expected_retrieved": nb_expected_retrieved
        }
    return counts_by_sample
Пример #16
0
def excluded_obs_on_blastMetrics( input_biom, tag, cmp_operator, threshold, excluded_file ):
    """
    @summary: Writes the list of the observations with no affiliations with sufficient blast value.
    @param input_biom: [str] The path to the BIOM file to check.
    @param tag: [str] The metadata checked.
    @param cmp_operator: [str] The operator use in comparison (tag_value ">=" thresold or tag_value "<=" thresold ).
    @param threshold: [float] The limit for the tag value.
    @param excluded_file: [str] The path to the output file (one observation ID by line).
    @raise KeyError: When cmp_operator is not ">=" or "<=".
    @note: An observation without blast affiliation (empty or None) is written in the excluded file.
    """
    valid_operators = {
        ">=": operator.ge,
        "<=": operator.le
    }
    cmp_func = valid_operators[cmp_operator]
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the output even if an alignment value cannot be parsed
    with open( excluded_file, "w" ) as FH_excluded_file:
        for observation in biom.get_observations():
            alignments = observation["metadata"]["blast_affiliations"] or [] # None means no affiliation
            # One alignment with a sufficient value is enough to keep the observation
            is_kept = any( cmp_func(float(current_alignment[tag]), threshold) for current_alignment in alignments )
            if not is_kept:
                FH_excluded_file.write( str(observation["id"]) + "\n" )
Пример #17
0
def getRealTaxByRefID( input_biom, taxonomy_key, duplication_groups ):
    """
    @summary: Returns the cleaned taxonomies by reference. A reference listed in duplication_groups receives the taxonomies of all the members of its group.
    @param input_biom: [str] Path to BIOM file.
    @param taxonomy_key: [str] The metadata key for taxonomy.
    @param duplication_groups: [dict] By reference ID the list of references with the same sequence. None if there is no duplication.
    @return: [dict] List of taxonomies by reference ID.
             Example:
               {
                 "MVF01000012.1.1317": [
                   ["Root", "Bacteria", "Proteobacteria", "Gammaproteobacteria", "Enterobacteriales", "Enterobacteriaceae", "Cronobacter", "Escherichia coli BIDMC 73"]
                 ],
                 "JQ607252.1.1437": [
                   ["Root", "Bacteria", "Firmicutes", "Bacilli", "Bacillales", "Staphylococcaceae", "Staphylococcus", "bacterium NLAE-zl-P471"],
                   ["Root", "Bacteria", "Firmicutes", "Bacilli", "Bacillales", "Staphylococcaceae", "Staphylococcus", "Staphylococcus aureus M17299"]
                 ]
               }
    """
    # Cleaned taxonomy of each observation
    own_taxonomy = dict()
    biom = BiomIO.from_json( input_biom )
    for observation in biom.get_observations():
        own_taxonomy[observation["id"]] = getCleanedTaxonomy(observation["metadata"][taxonomy_key])
    # By default an observation only has its own taxonomy
    taxonomy_by_obs_id = dict()
    for obs_id in own_taxonomy:
        taxonomy_by_obs_id[obs_id] = [own_taxonomy[obs_id]]
    # The references of a duplication group have the taxonomies of all the group members
    if duplication_groups is not None:
        for obs_id in duplication_groups:
            group_members = duplication_groups[obs_id]
            taxonomy_by_obs_id[obs_id] = [own_taxonomy[member_id] for member_id in group_members]
    return taxonomy_by_obs_id
Пример #18
0
 def get_step_size(self, nb_step=35):
     """
     @summary: Returns the step size to obtain 'nb_step' steps or more in 3/4 of samples.
     @param nb_step: [int] The number of expected steps.
     @returns: [int] The step size (at least 1).
     """
     # Get the number of sequences by sample
     biom = BiomIO.from_json( self.in_biom )
     counts = sorted( biom.get_sample_count(sample_name) for sample_name in biom.get_samples_names() )
     del biom
     nb_samples = len(counts)
     # Finds the lower quartile number of sequences (floor division: the result is used as a list index)
     lower_quartile_idx = nb_samples // 4
     nb_seq = counts[lower_quartile_idx]
     # If lower quartile sample is empty: use the first non-empty sample above it, if any
     if nb_seq == 0:
         for next_count in counts[lower_quartile_idx + 1:]:
             if next_count != 0:
                 nb_seq = next_count
                 break
     step_size = int(nb_seq/nb_step)
     return max(1, step_size)
Пример #19
0
def observations_depth( input_biom, output_depth ):
    """
    @summary : Write the depths of the observation in file.
    @param input_biom : [str] path to the biom file processed.
    @param output_depth : [str] path to the output file.
    @note : Example of one output file
                #Depth<TAB>Nb_Observ_concerned<TAB>Prct_Observ_concerned
                1<TAB>65<TAB>65.000
                2<TAB>30<TAB>30.000
                3<TAB>0<TAB>0.000
                4<TAB>5<TAB>5.000
            The percentage is computed on the observations with a count different from 0.
    """
    obs_depth = list() # obs_depth[depth] is the number of observations with this count
    nb_observ = 0
    # Process depth calculation
    biom = BiomIO.from_json( input_biom )
    for observation_id, observation_count in biom.get_observations_counts():
        # Extend the list up to the current depth
        nb_missing = observation_count + 1 - len(obs_depth)
        if nb_missing > 0:
            obs_depth.extend( [0] * nb_missing )
        obs_depth[observation_count] += 1
        if observation_count != 0:
            nb_observ += 1
    del biom
    # Write output (the context manager closes the file even if a write fails)
    with open( output_depth, 'w' ) as out_fh:
        out_fh.write( "#Depth\tNb_Observ_concerned\tPrct_Observ_concerned\n" )
        # The depth 0 is skipped; the loop is empty when no observation has a count, so nb_observ cannot be 0 here
        for depth in range(1, len(obs_depth)):
            prct = (float(obs_depth[depth])/ nb_observ)*100
            out_fh.write( str(depth) + "\t" + str(obs_depth[depth]) + "\t" + ("%.3f" % prct) + "\n" )
Пример #20
0
def getRealTaxByRefID( input_biom, taxonomy_key, duplication_groups ):
    """
    @summary: Return the cleaned taxonomies by reference ID.
    @param input_biom: [str] Path to BIOM file.
    @param taxonomy_key: [str] The metadata key for taxonomy.
    @param duplication_groups: [dict] By reference ID the list of references with the same sequence. With None each reference keeps only its own taxonomy.
    @return: [dict] List of taxonomies by reference ID.
             Example:
               {
                 "MVF01000012.1.1317": [
                   ["Root", "Bacteria", "Proteobacteria", "Gammaproteobacteria", "Enterobacteriales", "Enterobacteriaceae", "Cronobacter", "Escherichia coli BIDMC 73"]
                 ],
                 "JQ607252.1.1437": [
                   ["Root", "Bacteria", "Firmicutes", "Bacilli", "Bacillales", "Staphylococcaceae", "Staphylococcus", "bacterium NLAE-zl-P471"],
                   ["Root", "Bacteria", "Firmicutes", "Bacilli", "Bacillales", "Staphylococcaceae", "Staphylococcus", "Staphylococcus aureus M17299"]
                 ]
               }
    @note: Every member of a duplication group must be an observation of the BIOM (KeyError otherwise).
    """
    biom = BiomIO.from_json( input_biom )
    # Cleaned taxonomy of each observation
    own_taxonomy = dict()
    for current_obs in biom.get_observations():
        own_taxonomy[current_obs["id"]] = getCleanedTaxonomy(current_obs["metadata"][taxonomy_key])
    # By default an observation is only described by its own taxonomy
    taxonomy_by_obs_id = dict()
    for ref_id in own_taxonomy:
        taxonomy_by_obs_id[ref_id] = [own_taxonomy[ref_id]]
    # A duplicated reference is described by the taxonomies of all the members of its group
    if duplication_groups is None:
        return taxonomy_by_obs_id
    for ref_id in duplication_groups:
        group_members = duplication_groups[ref_id]
        taxonomy_by_obs_id[ref_id] = [own_taxonomy[member_id] for member_id in group_members]
    return taxonomy_by_obs_id
Пример #21
0
def write_log(in_biom, out_biom, log):
    """
    @summary: Writes the number of OTU before and after the process, for each sample and for the whole dataset.
    @param in_biom: [str] Path to the initial BIOM file.
    @param out_biom: [str] Path to the BIOM file produced by the process.
    @param log: [str] Path to the output log file.
    @note: Each sample of in_biom is also requested in out_biom.
    """
    # The context manager closes the log even if a BIOM cannot be loaded or processed
    with open(log, "w") as FH_log:
        # NOTE(review): this header announces a tabular layout but the records below are written as indented text blocks - to confirm with the consumers of the log
        FH_log.write("#sample\tnb_otu_before\tnb_otu_after\n")
        initial_biom = BiomIO.from_json( in_biom )
        new_biom = BiomIO.from_json( out_biom )

        # Counts by sample
        for sample_name in initial_biom.get_samples_names():
            nb_otu_before = len(initial_biom.get_sample_obs(sample_name))
            nb_otu_after = len(new_biom.get_sample_obs(sample_name))
            FH_log.write("Sample name: "+sample_name+"\n\tnb initials OTU: "+str(nb_otu_before)+"\n\tnb normalized OTU: "+str(nb_otu_after)+"\n")

        # Counts on all samples
        nb_initial_otu = len(initial_biom.rows)
        nb_new_otu = len(new_biom.rows)
        FH_log.write("Sample name: all samples\n\tnb initials OTU: "+str(nb_initial_otu)+"\n\tnb normalized OTU: "+str(nb_new_otu)+"\n")
Пример #22
0
def get_checked( abund_file, checked_sample, taxonomy_key, expected_by_depth ):
    """
    @summary: Returns the count by taxon by rank depth for the observations present in one sample.
    @param abund_file: [str] Path to the BIOM file.
    @param checked_sample: [str] Name of the evaluated sample.
    @param taxonomy_key: [str] The metadata key for taxonomy.
    @param expected_by_depth: By rank depth the expected taxonomies (ranks joined by ";"). Used to resolve the observations ending by "Multi-affiliation".
    @return: [dict] By rank depth the count by taxonomy (ranks joined by ";" from the root to the depth).
    @note: A multi-affiliation is replaced only when exactly one of its distinct blast affiliations is expected; otherwise a warning is emitted and the observation is counted with its "Multi-affiliation" taxonomy.
    """
    checked_by_depth = dict()
    biom = BiomIO.from_json(abund_file)
    for current_obs in biom.get_observations():
        obs_metadata = current_obs["metadata"]
        clean_taxonomy = getCleanedTaxonomy(obs_metadata[taxonomy_key])
        count = biom.get_count(current_obs["id"], checked_sample)
        if not count > 0: # The observation is absent from the sample
            continue
        if clean_taxonomy[-1] == "Multi-affiliation":
            expected_taxonomies = expected_by_depth[len(clean_taxonomy)-1]
            seen_taxonomies = list() # Distinct affiliations in order of appearance
            expected_matches = list() # Affiliations that are expected
            for affiliation in obs_metadata["blast_affiliations"]:
                affi_taxonomy = ";".join(getCleanedTaxonomy(affiliation["taxonomy"]))
                if affi_taxonomy in seen_taxonomies:
                    continue
                seen_taxonomies.append(affi_taxonomy)
                if affi_taxonomy in expected_taxonomies:
                    expected_matches.append(getCleanedTaxonomy(affiliation["taxonomy"]))
            if len(expected_matches) == 1:
                clean_taxonomy = expected_matches[0]
            else:
                warnings.warn( "Multi-affiliation cannot be resolved for " + str((float(count)*100)/biom.get_total_count()) + "% sequences. Possible taxonomies: '" + "', '".join(seen_taxonomies) + "'." )
        # Add the count to each rank of the lineage
        for rank_depth in range(len(clean_taxonomy)):
            rank_taxonomy = ";".join(clean_taxonomy[:rank_depth + 1])
            counts_at_depth = checked_by_depth.setdefault(rank_depth, dict())
            counts_at_depth[rank_taxonomy] = counts_at_depth.get(rank_taxonomy, 0) + count
    return checked_by_depth
Пример #23
0
def get_checkedTax(real_tax, input_biom, taxonomy_key, multi_affiliation,
                   logfile):
    """
    @summary: Lists the distinct cleaned taxonomies found in the observations of a BIOM file.
    @param real_tax: [dict] Taxonomy. Only forwarded to checkMultiaffiliation().
    @param input_biom: [str] Path to BIOM file.
    @param taxonomy_key: [str] The metadata key for taxonomy (read only when multi_affiliation is False).
    @param multi_affiliation: [bool] If True the taxonomies come from every entry of the "blast_affiliations" metadata instead of the taxonomy_key metadata.
    @param logfile: Forwarded to checkMultiaffiliation() (presumably the log path - to confirm against this helper).
    @return: [tuple] The number of observations in the BIOM and the list of the distinct taxonomies (ranks joined by ";") in order of first appearance.
    """
    nb_seq = 0
    tax_list = list()
    seen_taxonomies = set()  # Mirror of tax_list: constant-time membership instead of a list scan
    biom = BiomIO.from_json(input_biom)
    for observation in biom.get_observations():
        nb_seq += 1
        # Get taxonomy
        if not multi_affiliation:  # Standard affiliation
            possible_taxonomies = [
                getCleanedTaxonomy(observation["metadata"][taxonomy_key])
            ]
        else:  # Multi-affiliation
            possible_taxonomies = [
                getCleanedTaxonomy(affi["taxonomy"])
                for affi in observation["metadata"]["blast_affiliations"]
            ]
            checkMultiaffiliation(real_tax, observation["id"],
                                  possible_taxonomies, logfile)
        # Store the new taxonomies
        for taxonomy_clean in possible_taxonomies:
            taxonomy_str = ";".join(taxonomy_clean)
            if taxonomy_str not in seen_taxonomies:
                seen_taxonomies.add(taxonomy_str)
                tax_list.append(taxonomy_str)

    return nb_seq, tax_list
Пример #24
0
def biom_fasta_to_tsv( input_biom, input_fasta, output_tsv, fields, list_separator ):
    """
    @summary: Convert BIOM file to TSV file with sequence.
    @param input_biom: [str] Path to the BIOM file.
    @param input_fasta: [str] Path to the sequences of the observations.
    @param output_tsv: [str] Path to the output file (format : TSV).
    @param fields: [list] Columns and their order in output. Special columns : '@observation_name', '@observation_sum', '@sample_count', '@rdp_tax_and_bootstrap', '@seed_sequence'. The others columns must be metadata title. It must contain '@seed_sequence' (ValueError otherwise) and it is not modified.
    @param list_separator: [str] Separator for complex metadata.
    @note: One line is written by record of the fasta, in the order of the fasta.
    """
    biom = BiomIO.from_json( input_biom )
    # The column is located before the creation of the output: nothing is written if it is missing
    sequence_idx = fields.index("@seed_sequence")
    # Work on a copy: deleting in 'fields' itself would remove the column from the caller's list
    fields_without_seq = list(fields)
    del fields_without_seq[sequence_idx]
    # The context manager closes the output even if a record cannot be processed
    with open( output_tsv, "w" ) as out_fh:
        # Header
        header_parts = header_line_parts( fields, biom )
        out_fh.write( "#" + "\t".join(header_parts) + "\n" )
        # Data
        # NOTE(review): FH_in is never closed explicitly - confirm whether FastaIO provides close()
        FH_in = FastaIO( input_fasta )
        for record in FH_in:
            obs_idx = biom.find_idx("observation", record.id)
            count_by_sample = biom.data.get_row_array(obs_idx)
            observation_parts = observation_line_parts( biom.rows[obs_idx], count_by_sample, fields_without_seq, list_separator )
            # The sequence is put back at the position requested in fields
            observation_parts.insert( sequence_idx, record.string )
            out_fh.write( "\t".join(observation_parts) + "\n" )
Пример #25
0
    def __init__( self, out_tsv, in_biom, in_fasta=None ):
        """
        @summary: Builds the biom2tsv.py command line with the columns available in the BIOM file.
        @param out_tsv: [str] Path to output TSV file.
        @param in_biom: [str] Path to BIOM file.
        @param in_fasta: [str] Path to the sequences file. With None the sequence option and the sequence column are omitted.
        """
        # Sequence file option
        if in_fasta is None:
            sequence_file_opt = ""
        else:
            sequence_file_opt = " --input-fasta " + in_fasta

        # Select the columns from the metadata present in the BIOM
        biom = BiomIO.from_json( in_biom )
        selected_fields = list()
        if biom.has_observation_metadata( 'rdp_taxonomy' ) and biom.has_observation_metadata( 'rdp_bootstrap' ):
            selected_fields.append( "'@rdp_tax_and_bootstrap'" )
        if biom.has_observation_metadata( 'blast_taxonomy' ):
            selected_fields.append( "'blast_taxonomy'" )
        if biom.has_observation_metadata( 'blast_affiliations' ):
            selected_fields.extend([
                "'@blast_subject'",
                "'@blast_perc_identity'",
                "'@blast_perc_query_coverage'",
                "'@blast_evalue'",
                "'@blast_aln_length'"
            ])
        if biom.has_observation_metadata( 'seed_id' ):
            selected_fields.append( "'seed_id'" )
        if in_fasta is not None:
            selected_fields.append( "'@seed_sequence'" )
        # Columns always present
        selected_fields.extend( ["'@observation_name'", "'@observation_sum'", "'@sample_count'"] )
        conversion_tags = " ".join( selected_fields )

        # Set command
        cmd_parameters = "--input-file " + in_biom + sequence_file_opt + " --output-file " + out_tsv + " --fields " + conversion_tags
        Cmd.__init__( self,
                      'biom2tsv.py',
                      'Converts a BIOM file in TSV file.',
                      cmd_parameters,
                      '--version' )
Пример #26
0
def biom_fasta_to_tsv( input_biom, input_fasta, output_tsv, fields, list_separator ):
    """
    @summary: Convert BIOM file to TSV file with sequence.
    @param input_biom: [str] Path to the BIOM file.
    @param input_fasta: [str] Path to the sequences of the observations.
    @param output_tsv: [str] Path to the output file (format : TSV).
    @param fields: [list] Columns and their order in output. Special columns : '@observation_name', '@observation_sum', '@sample_count', '@rdp_tax_and_bootstrap', '@seed_sequence'. The others columns must be metadata title. It must contain '@seed_sequence' (ValueError otherwise) and it is not modified.
    @param list_separator: [str] Separator for complex metadata.
    @note: One line is written by record of the fasta, in the order of the fasta.
    """
    biom = BiomIO.from_json( input_biom )
    # The column is located before the creation of the output: nothing is written if it is missing
    sequence_idx = fields.index("@seed_sequence")
    # Work on a copy: deleting in 'fields' itself would remove the column from the caller's list
    fields_without_seq = list(fields)
    del fields_without_seq[sequence_idx]
    # The context manager closes the output even if a record cannot be processed
    with open( output_tsv, "w" ) as out_fh:
        # Header
        header_parts = header_line_parts( fields, biom )
        out_fh.write( "#" + "\t".join(header_parts) + "\n" )
        # Data
        # NOTE(review): FH_in is never closed explicitly - confirm whether FastaIO provides close()
        FH_in = FastaIO( input_fasta )
        for record in FH_in:
            obs_idx = biom.find_idx("observation", record.id)
            count_by_sample = biom.data.get_row_array(obs_idx)
            observation_parts = observation_line_parts( biom.rows[obs_idx], count_by_sample, fields_without_seq, list_separator )
            # The sequence is put back at the position requested in fields
            observation_parts.insert( sequence_idx, record.string )
            out_fh.write( "\t".join(observation_parts) + "\n" )
Пример #27
0
def observations_depth( input_biom, output_depth ):
    """
    @summary : Write the depths of the observation in file.
    @param input_biom : [str] path to the biom file processed.
    @param output_depth : [str] path to the output file.
    @note : Example of one output file
                #Depth<TAB>Nb_Observ_concerned<TAB>Prct_Observ_concerned
                1<TAB>65<TAB>65.000
                2<TAB>30<TAB>30.000
                3<TAB>0<TAB>0.000
                4<TAB>5<TAB>5.000
            The depth 0 is never written and the observations with a count
            of 0 are excluded from the total used for the percentage.
    """
    obs_depth = list() # obs_depth[d] is the number of observations with a count equal to d
    nb_observ = 0 # Number of observations with a count different from 0
    # Process depth calculation
    biom = BiomIO.from_json( input_biom )
    for observation_id, observation_count in biom.get_observations_counts():
        # Extend the histogram up to the current depth
        while len(obs_depth) <= observation_count:
            obs_depth.append(0)
        obs_depth[observation_count] += 1
        if observation_count != 0:
            nb_observ += 1
    del biom
    # Write output (the context manager closes the file even if a write fails)
    with open( output_depth, 'w' ) as out_fh:
        out_fh.write( "#Depth\tNb_Observ_concerned\tPrct_Observ_concerned\n" )
        # A depth >= 1 exists only if at least one observation is not empty: nb_observ cannot be 0 here
        for depth in range(1, len(obs_depth)):
            prct = (float(obs_depth[depth])/ nb_observ)*100
            out_fh.write( str(depth) + "\t" + str(obs_depth[depth]) + "\t" + ("%.3f" % prct) + "\n" )
Пример #28
0
def getRealAbunByRank(taxonomy_key, input_biom, sample):
    """
    @summary: Returns the taxonomies and the count by taxa by rank in sample.
    @param taxonomy_key: [str] The metadata key for taxonomy.
    @param input_biom: [str] Path to BIOM file.
    @param sample: [str] sample name.
    @return: [tuple] First, the list of the distinct taxonomies (ranks joined by ";") of the observations present in the sample, in order of first appearance.
             Second, the list indexed by rank depth of the dictionaries of count by taxon. A taxon is identified by its lineage from the root joined by ";".
    """
    abund_by_rank = list()
    tax_list = list()
    biom = BiomIO.from_json(input_biom)
    for observation in biom.get_observations():
        count = biom.get_count(observation["id"], sample)
        if count > 0:
            taxonomy_clean = getCleanedTaxonomy(
                observation["metadata"][taxonomy_key])
            taxonomy_str = ";".join(taxonomy_clean)
            if taxonomy_str not in tax_list:
                tax_list.append(taxonomy_str)
            for depth in range(len(taxonomy_clean)):
                if len(abund_by_rank) < depth + 1:
                    abund_by_rank.append(dict())
                taxon = ";".join(
                    taxonomy_clean[:depth + 1]
                )  # prevent bug with same sp name but with different ancestors
                # 'in' replaces dict.has_key(), which was removed in Python 3
                if taxon not in abund_by_rank[depth]:
                    abund_by_rank[depth][taxon] = 0
                abund_by_rank[depth][taxon] += count
    return tax_list, abund_by_rank
Пример #29
0
    def __init__(self, out_tsv, in_biom, in_fasta=None):
        """
        @summary: Prepares the call to biom2tsv.py with the fields that the BIOM file can provide.
        @param out_tsv: [str] Path to output TSV file.
        @param in_biom: [str] Path to BIOM file.
        @param in_fasta: [str] Path to the sequences file. If None neither the sequence option nor the sequence field is used.
        """
        # Sequence file option
        sequence_file_opt = ""
        if in_fasta is not None:
            sequence_file_opt = " --input-fasta " + in_fasta

        # Check the metadata
        biom = BiomIO.from_json(in_biom)
        has_metadata = biom.has_observation_metadata
        tags = list()
        if has_metadata('rdp_taxonomy') and has_metadata('rdp_bootstrap'):
            tags.append("'@rdp_tax_and_bootstrap'")
        if has_metadata('blast_taxonomy'):
            tags.append("'blast_taxonomy'")
        if has_metadata('blast_affiliations'):
            for blast_tag in ("subject", "perc_identity",
                              "perc_query_coverage", "evalue", "aln_length"):
                tags.append("'@blast_" + blast_tag + "'")
        if has_metadata('seed_id'):
            tags.append("'seed_id'")
        if in_fasta is not None:
            tags.append("'@seed_sequence'")
        # Fields present whatever the metadata
        tags.append("'@observation_name' '@observation_sum' '@sample_count'")
        conversion_tags = " ".join(tags)

        # Set command
        parameters = ("--input-file " + in_biom + sequence_file_opt +
                      " --output-file " + out_tsv + " --fields " +
                      conversion_tags)
        Cmd.__init__(self, 'biom2tsv.py', 'Converts a BIOM file in TSV file.',
                     parameters, '--version')
Пример #30
0
 def get_step_size(self, nb_step=35):
     """
     @summary: Returns the step size to obtain 'nb_step' steps or more in 3/4 of samples.
     @param nb_step: [int] The number of expected steps.
     @returns: [int] The step size. It is at least 1, and it is 1 when the BIOM contains no sample.
     """
     # Get the number of sequences by sample
     biom = BiomIO.from_json(self.in_biom)
     counts = sorted(
         biom.get_sample_count(sample_name)
         for sample_name in biom.get_samples_names())
     del biom
     nb_samples = len(counts)
     # Without sample there is no quartile: fall back on the minimal step
     if nb_samples == 0:
         return 1
     # Finds the lower quartile number of sequences
     # (floor division keeps the index an integer on Python 2 and Python 3)
     lower_quartile_idx = nb_samples // 4
     nb_seq = counts[lower_quartile_idx]
     # If lower quartile sample is empty: use the first non-empty sample after it
     if nb_seq == 0:
         for count in counts[lower_quartile_idx + 1:]:
             if count != 0:
                 nb_seq = count
                 break
     step_size = int(nb_seq // nb_step)
     return max(1, step_size)
Пример #31
0
def get_retrieved_by_sample( biom_file, reference_by_obs_id, references_by_sample, uniq_id, uniq_id_by_sample ):
    """
    @summary: Returns by sample the number of detected observations and the number of distinct retrieved sequences.
    @param biom_file: [str] Path to the BIOM file.
    @param reference_by_obs_id: [dict] By observation ID the ID of its reference. A value containing "," is processed as a chimera and is only counted in detected.
    @param references_by_sample: [dict] By sample name the IDs of the expected references.
    @param uniq_id: [dict] By reference ID the value used to count once the references retrieved (presumably the ID of the unique sequence - to confirm against caller).
    @param uniq_id_by_sample: [dict] By sample name, by reference ID, the value used to count once the expected references retrieved.
    @return: [dict] By sample name the keys "detected", "retrieved" and "expected_retrieved" and their counts.
    """
    counts_by_sample = dict()
    biom = BiomIO.from_json( biom_file )
    for sample_name in biom.get_samples_names():
        nb_detected = 0
        retrieved_refs = set()
        expected_refs = set()
        for obs in biom.get_observations_by_sample( sample_name ):
            nb_detected += 1
            ref_id = reference_by_obs_id[obs['id']]
            if "," in ref_id: # Is a chimera
                continue
            retrieved_refs.add( ref_id )
            if ref_id in references_by_sample[sample_name]:
                expected_refs.add( ref_id )
        # Uniq sequence for retrieved
        uniq_retrieved = set( uniq_id[ref_id] for ref_id in retrieved_refs )
        # Uniq sequence for expected retrieved
        uniq_expected_retrieved = set( uniq_id_by_sample[sample_name][ref_id] for ref_id in expected_refs )
        # Results
        sample_counts = dict()
        sample_counts["detected"] = nb_detected
        sample_counts["retrieved"] = len(uniq_retrieved)
        sample_counts["expected_retrieved"] = len(uniq_expected_retrieved)
        counts_by_sample[sample_name] = sample_counts
    return counts_by_sample
Пример #32
0
def write_log(in_biom, out_biom, log):
    """
    @summary: Writes the number of OTU before and after the process, for each sample and for the whole dataset.
    @param in_biom: [str] Path to the initial BIOM file.
    @param out_biom: [str] Path to the BIOM file produced by the process.
    @param log: [str] Path to the output log file.
    @note: Each sample of in_biom is also requested in out_biom.
    """
    # The context manager closes the log even if a BIOM cannot be loaded or processed
    with open(log, "w") as FH_log:
        # NOTE(review): this header announces a tabular layout but the records below are written as indented text blocks - to confirm with the consumers of the log
        FH_log.write("#sample\tnb_otu_before\tnb_otu_after\n")
        initial_biom = BiomIO.from_json( in_biom )
        new_biom = BiomIO.from_json( out_biom )

        # Counts by sample
        for sample_name in initial_biom.get_samples_names():
            nb_otu_before = len(initial_biom.get_sample_obs(sample_name))
            nb_otu_after = len(new_biom.get_sample_obs(sample_name))
            FH_log.write("Sample name: "+sample_name+"\n\tnb initials OTU: "+str(nb_otu_before)+"\n\tnb normalized OTU: "+str(nb_otu_after)+"\n")

        # Counts on all samples
        nb_initial_otu = len(initial_biom.rows)
        nb_new_otu = len(new_biom.rows)
        FH_log.write("Sample name: all samples\n\tnb initials OTU: "+str(nb_initial_otu)+"\n\tnb normalized OTU: "+str(nb_new_otu)+"\n")
def aff_to_metadata(reference_file, biom_in, biom_out, blast_files=None, rdp_files=None):
    """
    @summary: Add taxonomy metadata on biom file from a blast result.
    @param reference_file: [str] The path to the reference file.
    @param biom_in: [str] The path to the Biom file to process.
    @param biom_out: [str] The path to the biom output file.
    @param blast_files: [list] the list of the path to the blast results in tabular format (outfmt 6 with NCBI Blast+). With None the blast metadata are not added.
    @param rdp_files: [list] the list of path to the RDPClassifier results. With None the RDP metadata are not added.
    @note: When a type of affiliation is requested, its metadata are added on every observation: an observation
           without match receives None (and an empty list for "blast_affiliations").
    """
    # Build an hash with the taxonomy for each gene (key=gene_id ; value=gene_taxonomy)
    taxonomy_by_reference = get_tax_from_fasta( reference_file )

    # Retrieve blast clusters annotations
    cluster_blast_annot = dict()
    if blast_files is not None:
        cluster_blast_annot = get_bests_blast_affi( blast_files, taxonomy_by_reference )
    del taxonomy_by_reference

    # Retrieve rdp clusters annotations
    cluster_rdp_annot = dict()
    if rdp_files is not None:
        cluster_rdp_annot = get_rdp_affi( rdp_files )

    # Add metadata to biom
    biom = BiomIO.from_json(biom_in)
    for cluster in biom.get_observations():
        cluster_id = cluster["id"]
        # Blast
        if blast_files is not None:
            blast_taxonomy = None
            blast_affiliations = list()
            # 'in' replaces dict.has_key(), which was removed in Python 3
            if cluster_id in cluster_blast_annot: # Current observation has a match
                blast_taxonomy = get_tax_consensus( [alignment['taxonomy'] for alignment in cluster_blast_annot[cluster_id]['alignments']] )
                blast_affiliations = cluster_blast_annot[cluster_id]['alignments']
            biom.add_metadata( cluster_id, "blast_affiliations", blast_affiliations, "observation" )
            biom.add_metadata( cluster_id, "blast_taxonomy", blast_taxonomy, "observation" )
        # RDP
        if rdp_files is not None:
            rdp_taxonomy = None
            rdp_bootstrap = None
            if cluster_id in cluster_rdp_annot: # Current observation has a match
                rdp_taxonomy = cluster_rdp_annot[cluster_id]['taxonomy']
                rdp_bootstrap = cluster_rdp_annot[cluster_id]['bootstrap']
            biom.add_metadata(cluster_id, "rdp_taxonomy", rdp_taxonomy, "observation")
            biom.add_metadata(cluster_id, "rdp_bootstrap", rdp_bootstrap, "observation")
    BiomIO.write(biom_out, biom)
Пример #34
0
def samples_hclassification( input_biom, output_newick, distance_method, linkage_method, min_count=1 ):
    """
    @summary : Process and write an hierarchical classification from Biom.
    @param input_biom : [str] Path to the BIOM file to process.
    @param output_newick : [str] Path to the newick output file.
    @param distance_method : [str] Used distance method for classify (given to scipy pdist).
    @param linkage_method : [str] Used linkage method for classify (given to scipy linkage).
    @param min_count : [int] Samples with a count lower than this value are not processed.
    @raise Exception : If all the samples have a count lower than min_count.
    @note : A log with the number of processed samples and the excluded samples is printed on stdout.
    """
    from scipy.spatial.distance import pdist
    from scipy.cluster.hierarchy import linkage, to_tree
    data_array = list()
    processed_samples = list()
    excluded_samples = list()

    # Normalisation on count by sample
    biom = BiomIO.from_json( input_biom )
    for col_idx, current_sample in enumerate(biom.columns):
        sum_on_sample = biom.data.get_col_sum( col_idx )
        if sum_on_sample < min_count:
            excluded_samples.append( current_sample['id'] )
        else:
            processed_samples.append( current_sample['id'] )
            # Each count becomes the proportion of the sample's total
            OTUs_norm = list()
            for row_idx in range(len(biom.rows)):
                OTUs_norm.append( biom.data.nb_at(row_idx, col_idx)/float(sum_on_sample) )
            data_array.append( OTUs_norm )
    nb_samples = len(biom.columns)
    del biom

    # Process distance
    if len(processed_samples) < 1:
        raise Exception("All samples have a count lower than threshold (" + str(min_count) + ").")
    elif len(processed_samples) == 1:
        # Write newick (a tree cannot be computed with only one sample)
        with open( output_newick, "w" ) as out_fh:
            out_fh.write( "(" + processed_samples[0] + ");\n" )
    else:
        # Computing the distance and linkage
        data_dist = pdist( data_array, distance_method )
        data_link = linkage( data_dist, linkage_method )
        # Write newick (the tree is converted before the opening: a failure does not leave an open or truncated file)
        scipy_hc_tree = to_tree( data_link , rd=False )
        id_2_name = dict( enumerate(processed_samples) )
        newick_tree = to_newick( scipy_hc_tree, id_2_name )
        with open( output_newick, "w" ) as out_fh:
            out_fh.write( newick_tree + "\n" )

    # Display log (print with one parenthesized string has the same output on Python 2 and Python 3)
    print( "# Hierarchical clustering log:\n" +
           "\tNumber of samples in BIOM: " + str(nb_samples) + "\n" +
           "\tNumber of processed samples: " + str(len(processed_samples)) )
    if nb_samples > len(processed_samples):
        print( "\n\tExcluded samples (count < " + str(min_count) + "): " + ", ".join(sorted(excluded_samples)) )
Пример #35
0
def samples_hclassification( input_biom, output_newick, distance_method, linkage_method, min_count=1 ):
    """
    @summary : Process and write an hierarchical classification from Biom.
    @param input_biom : [str] Path to the BIOM file to process.
    @param output_newick : [str] Path to the newick output file.
    @param distance_method : [str] Used distance method for classify (given to scipy pdist).
    @param linkage_method : [str] Used linkage method for classify (given to scipy linkage).
    @param min_count : [int] Samples with a count lower than this value are not processed.
    @note : If all the samples have a count lower than min_count the error is given to raise_exception().
            A log with the number of processed samples and the excluded samples is printed on stdout.
    """
    from scipy.spatial.distance import pdist
    from scipy.cluster.hierarchy import linkage, to_tree
    data_array = list()
    processed_samples = list()
    excluded_samples = list()

    # Normalisation on count by sample
    biom = BiomIO.from_json( input_biom )
    for col_idx, current_sample in enumerate(biom.columns):
        sum_on_sample = biom.data.get_col_sum( col_idx )
        if sum_on_sample < min_count:
            excluded_samples.append( current_sample['id'] )
        else:
            processed_samples.append( current_sample['id'] )
            # Each count becomes the proportion of the sample's total
            OTUs_norm = list()
            for row_idx in range(len(biom.rows)):
                OTUs_norm.append( biom.data.nb_at(row_idx, col_idx)/float(sum_on_sample) )
            data_array.append( OTUs_norm )
    nb_samples = len(biom.columns)
    del biom

    # Process distance
    if len(processed_samples) < 1:
        raise_exception( Exception("\n\n#ERROR :All samples have a count lower than threshold (" + str(min_count) + ").\n\n"))
    elif len(processed_samples) == 1:
        # Write newick (a tree cannot be computed with only one sample)
        with open( output_newick, "wt" ) as out_fh:
            out_fh.write( "(" + processed_samples[0] + ");\n" )
    else:
        # Computing the distance and linkage
        data_dist = pdist( data_array, distance_method )
        data_link = linkage( data_dist, linkage_method )
        # Write newick (the tree is converted before the opening: a failure does not leave an open or truncated file)
        scipy_hc_tree = to_tree( data_link , rd=False )
        id_2_name = dict( enumerate(processed_samples) )
        newick_tree = to_newick( scipy_hc_tree, id_2_name )
        with open( output_newick, "wt" ) as out_fh:
            out_fh.write( newick_tree + "\n" )

    # Display log
    print( "# Hierarchical clustering log:\n" +
           "\tNumber of samples in BIOM: " + str(nb_samples) + "\n" +
           "\tNumber of processed samples: " + str(len(processed_samples)) )
    if nb_samples > len(processed_samples):
        print( "\n\tExcluded samples (count < " + str(min_count) + "): " + ", ".join(sorted(excluded_samples)) )
Пример #36
0
def process(args):
    """
    @summary: Submits the rarefaction and the taxonomy tree commands then writes the summary.
    @param args: The parameters of the script. The attributes read here are: output_file, input_biom,
                 taxonomy_tag, multiple_tag, tax_consensus_tag, taxonomic_ranks, rarefaction_ranks,
                 log_file and debug.
    @note: The temporary files are created in the directory of the output file and are deleted at the end
           of the process, even on error, except if args.debug is set.
    """
    output_dir = os.path.split(args.output_file)[0]
    tmp_files = TmpFiles(output_dir)

    try:
        # Select the BIOM and the taxonomy metadata used by the commands
        tmp_biom = args.input_biom
        used_taxonomy_tag = args.taxonomy_tag
        if args.multiple_tag is not None and args.tax_consensus_tag is not None:
            used_taxonomy_tag = args.tax_consensus_tag
        elif args.multiple_tag is not None:
            # Multiple affiliations without consensus: a temporary taxonomy
            # is added from the first affiliation of each observation
            used_taxonomy_tag = "Used_taxonomy_FROGS-affi"
            tmp_biom = tmp_files.add("tax.biom")
            biom = BiomIO.from_json(args.input_biom)
            for observation in biom.get_observations():
                metadata = observation["metadata"]
                affiliations = metadata[args.multiple_tag]
                if affiliations is not None and len(affiliations) > 0:
                    first_affiliation = affiliations[0]
                    metadata[used_taxonomy_tag] = first_affiliation[
                        args.taxonomy_tag]
            BiomIO.write(tmp_biom, biom)
            del biom

        # Rarefaction
        tax_depth = list()
        for rank in args.rarefaction_ranks:
            tax_depth.append(args.taxonomic_ranks.index(rank))
        rarefaction_cmd = Rarefaction(tmp_biom, tmp_files, used_taxonomy_tag,
                                      tax_depth)
        rarefaction_cmd.submit(args.log_file)
        rarefaction_files = rarefaction_cmd.output_files

        # Taxonomy tree
        tree_count_file = tmp_files.add("taxCount.enewick")
        tree_ids_file = tmp_files.add("taxCount_ids.tsv")
        tree_cmd = TaxonomyTree(tmp_biom, used_taxonomy_tag, tree_count_file,
                                tree_ids_file)
        tree_cmd.submit(args.log_file)

        # Writes summary
        write_summary(args.output_file, args.input_biom, tree_count_file,
                      tree_ids_file, rarefaction_files, args)
    finally:
        keep_tmp = args.debug
        if not keep_tmp:
            tmp_files.deleteAll()
Пример #37
0
def rarefaction( input_biom, interval=10000, ranks=None, taxonomy_key="taxonomy" ):
    """
    @summary: Returns the rarefaction by ranks by samples.
    @param input_biom: [str] Path to the biom file processed.
    @param interval: [int] Size of each sampling.
    @param ranks: [list] The rank(s) level for the diversity. A rank is a 0-based depth: the taxa are compared on their lineage from the root to this depth included. The value is required (the default None cannot be iterated).
                   Example :
                     Sampled set :
                       Bacteria; Proteobacteria; Alphaproteobacteria; Sphingomonadales; Sphingomonadaceae; Sphingomonas
                       Bacteria; Proteobacteria; Gammaproteobacteria; Vibrionales; Vibrionaceae; Vibrio; Vibrio halioticoli
                       Bacteria; Proteobacteria; Gammaproteobacteria; Legionellales; Coxiellaceae; Coxiella; Ornithodoros moubata symbiont A
                       Bacteria; Proteobacteria; Betaproteobacteria; Burkholderiales; Burkholderiaceae; Limnobacter; Limnobacter thiooxidans
                     Result for this set
                       With rank 0 or 1 : 1 group
                       With rank 2 : 3 different groups
                       With rank 3 or 4 or 5 : 4 different groups
    @param taxonomy_key : [str] The metadata title for the taxonomy in the input.
    @return: [dict] By ranks by samples the list of the cumulated number of differents taxa after each step. The numbers are stored as strings.
              Example :
                 {
                   1: {
                        "sampleA" : ["10", "20", "22", "23", "24", "25", "25", "25"],
                        "sampleB" : ["15", "25", "28", "30", "32", "34", "35", "36", "37", "37", "37", "37"]
                      }
                 }
    @warning: The taxa with name starting with unknown used as complete new name 'unknown'.
    @note: The number of steps of a sample is its count divided by interval (rounded down). The taxonomies are compared in lower case and the taxonomies shorter than the deepest rank are completed with 'unknown'.
    """
    biom = BiomIO.from_json( input_biom )
    sample_rarefaction = dict()
    for rank in ranks:
        sample_rarefaction[rank] = dict()
    for sample in biom.get_samples_names():
        # Taxa already seen in the sample for each rank
        seen_taxa = dict()
        for rank in ranks:
            sample_rarefaction[rank][sample] = list()
            seen_taxa[rank] = set()
        nb_steps = int(biom.get_sample_count( sample )/interval)
        for step_idx in range(nb_steps):
            for selected in biom.random_obs_extract_by_sample(sample, interval):
                observation = selected['observation']
                obs_metadata = observation["metadata"]
                if taxonomy_key in obs_metadata and obs_metadata[taxonomy_key] is not None:
                    taxonomy = biom.get_observation_taxonomy( observation["id"], taxonomy_key )
                else:
                    taxonomy = list()
                # Normalise the unknown taxa
                for idx, taxon in enumerate(taxonomy):
                    if taxon.lower().startswith("unknown"):
                        taxonomy[idx] = "unknown"
                # Complete the short taxonomies
                deepest_rank = max(ranks)
                while len(taxonomy) < deepest_rank:
                    taxonomy.append("unknown")
                # Store the lineage for each rank
                for rank in ranks:
                    lineage = ';'.join(taxonomy[0:rank+1])
                    seen_taxa[rank].add( lineage.lower() )
            # Number of taxa seen at the end of the step
            for rank in ranks:
                sample_rarefaction[rank][sample].append( str(len(seen_taxa[rank])) )
    return sample_rarefaction
Пример #38
0
def rarefaction( input_biom, interval=10000, ranks=None, taxonomy_key="taxonomy" ):
    """
    @summary: Returns the rarefaction by ranks by samples.
    @param input_biom: [str] Path to the biom file processed.
    @param interval: [int] Size of first sampling.
    @param ranks: [list] The rank(s) level for the diversity. It must be provided: the default None is not iterable and raises a TypeError.
                   Example :
                     Sampled set :
                       Bacteria; Proteobacteria; Alphaproteobacteria; Sphingomonadales; Sphingomonadaceae; Sphingomonas
                       Bacteria; Proteobacteria; Gammaproteobacteria; Vibrionales; Vibrionaceae; Vibrio; Vibrio halioticoli
                       Bacteria; Proteobacteria; Gammaproteobacteria; Legionellales; Coxiellaceae; Coxiella; Ornithodoros moubata symbiont A
                       Bacteria; Proteobacteria; Betaproteobacteria; Burkholderiales; Burkholderiaceae; Limnobacter; Limnobacter thiooxidans
                     Result for this set
                       With rank 1 or 2 : 1 group
                       With rank 3 : 3 different groups
                       With rank 4 or 5 or 6 : 4 different groups
    @param taxonomy_key : [str] The metadata title for the taxonomy in the input.
    @return: [dict] By ranks by samples the list of the numbers of different taxa for each step. The numbers are stored as strings.
              Example :
                 {
                   1: {
                        "sampleA" : ["10", "20", "22", "23", "24", "25", "25", "25"],
                        "sampleB" : ["15", "25", "28", "30", "32", "34", "35", "36", "37", "37", "37", "37"]
                      }
                 }
    @warning: The taxa with name starting with unknown used as complete new name 'unknown'.
    """
    sample_rarefaction = dict()
    biom = BiomIO.from_json( input_biom )
    for current_rank in ranks:
        sample_rarefaction[current_rank] = dict()
    for sample in biom.get_samples_names():
        # Taxa already seen in this sample, by rank (cumulative over the sampling steps)
        taxa = dict()
        for current_rank in ranks:
            sample_rarefaction[current_rank][sample] = list()
            taxa[current_rank] = dict()
        sample_count = biom.get_sample_count( sample )
        # range() needs an integer: "/" returns a float on Python 3
        expected_nb_iter = int(sample_count/interval)
        for current_nb_iter in range(expected_nb_iter):
            selected_observations = biom.random_obs_extract_by_sample(sample, interval)
            for current_selected in selected_observations:
                taxonomy = list()
                selected_metadata = current_selected['observation']["metadata"]
                if taxonomy_key in selected_metadata and selected_metadata[taxonomy_key] is not None:
                    taxonomy = biom.get_observation_taxonomy( current_selected['observation']["id"], taxonomy_key )
                # All the names starting with "unknown" are merged in a single taxon
                for idx, taxon in enumerate(taxonomy):
                    if taxon.lower().startswith("unknown"):
                        taxonomy[idx] = "unknown"
                # Too short taxonomies are completed with "unknown"
                while len(taxonomy) < max(ranks):
                    taxonomy.append("unknown")
                for current_rank in ranks:
                    taxonomy_str = (';'.join(taxonomy[0:current_rank+1])).lower()
                    taxa[current_rank][taxonomy_str] = True
            # One point by sampling step: the number of distinct taxa seen so far
            for current_rank in ranks:
                sample_rarefaction[current_rank][sample].append( str(len(taxa[current_rank])) )
    return sample_rarefaction
Пример #39
0
def get_obs_from_biom( in_biom ):
    """
    @summary: Loads a BIOM file and gives the count of each of its observations.
    @param in_biom: Path to the BIOM.
    @return: [dict] The count (value) by observation name (key).
    """
    biom = BiomIO.from_json(in_biom)
    obs_names = list(biom.get_observations_names())
    # Pair each name with its count; dict() consumes the map before biom is released
    count_by_obs = dict(zip(obs_names, map(biom.get_observation_count, obs_names)))
    del biom
    return count_by_obs
Пример #40
0
def get_bootstrap_distrib(input_biom, bootstrap_tag, multiple_tag):
    """
    @summary: Counts, for each taxonomic rank, the clusters and the sequences by bootstrap value.
    @param input_biom: The path to the processed BIOM.
    @param bootstrap_tag: The metadata tag used in BIOM file to store the taxonomy bootstraps.
    @param multiple_tag: The metadata tag used in BIOM file to store the list of possible taxonomies. If it is None the bootstrap is read directly in the observation metadata.
    @returns: [dict] By taxonomic rank the count for the different bootstrap categories.
              Example:
                {
                    "Phylum": {
                        "80": { "clstr": 1, "seq":100 },
                        "90": {    "clstr": 2,    "seq":400 },
                        "100": { "clstr": 50, "seq":20000 },
                    },
                    "Genus":{
                        "80":{ "clstr": 1, "seq":100 },
                        "90":{ "clstr": 2, "seq":400 },
                        "100":{ "clstr": 50, "seq":20000 },
                    }
                }
    @note: The names of the ranks come from the module-level args.taxonomic_ranks.
    """
    distrib_by_rank = dict()

    biom = BiomIO.from_json(input_biom)
    for obs in biom.get_observations():
        obs_meta = obs['metadata']

        # Locate the bootstrap list of the observation
        obs_bootstrap = None
        if multiple_tag is None:
            if bootstrap_tag in obs_meta:
                obs_bootstrap = obs_meta[bootstrap_tag]
        elif multiple_tag in obs_meta:
            affiliations = obs_meta[multiple_tag]
            if affiliations is not None and len(affiliations) > 0:
                # Only the first affiliation is used
                obs_bootstrap = affiliations[0][bootstrap_tag]
        if obs_bootstrap is None:
            continue

        # Update the counts of each rank
        for depth, depth_bootstrap in enumerate(obs_bootstrap):
            bootstrap_percent = depth_bootstrap * 100
            rank_name = args.taxonomic_ranks[depth]
            rank_distrib = distrib_by_rank.setdefault(rank_name, dict())
            category = rank_distrib.setdefault(bootstrap_percent, {"clstr": 0, "seq": 0})
            category["clstr"] += 1
            category["seq"] += biom.get_observation_count(obs['id'])
    del biom
    return distrib_by_rank
Пример #41
0
def excluded_obs_on_nBiggest( input_biom, nb_selected, excluded_file ):
    """
    @summary: Writes the list of all the observations without the n most abundant.
    @param input_biom: [str] The path to the BIOM file.
    @param nb_selected: [int] The number of the most abundant observations that will not be written in the excluded list.
    @param excluded_file: [str] The path to the output file (one observation name by line).
    """
    biom = BiomIO.from_json( input_biom )
    # Most abundant first: each element is used as (observation_name, observation_count)
    sorted_obs_counts = sorted( biom.get_observations_counts(), key=lambda observation: observation[1], reverse=True )
    # The context manager closes the file even if a write fails
    with open( excluded_file, "w" ) as FH_excluded_file:
        for observation_name, observation_count in sorted_obs_counts[nb_selected:]:
            FH_excluded_file.write( observation_name + "\n" )
Пример #42
0
def get_obs_from_biom(in_biom):
    """
    @summary: Gives the count of every observation stored in a BIOM file.
    @param in_biom: Path to the BIOM.
    @return: [dict] The observations counts indexed by the observations names.
    """
    biom = BiomIO.from_json(in_biom)
    # The local biom object is released when the function returns
    return {
        name: biom.get_observation_count(name)
        for name in biom.get_observations_names()
    }
Пример #43
0
def impacted_obs_by_undesired_taxon(input_biom, undesired_taxon_list,
                                    in_all_or_in_consensus, biom_out,
                                    impacted_file):
    """
    @summary: Writes the list of the observations with affiliations including undesired taxon and writes the BIOM without these affiliations.
    @param input_biom: [str] The path to the BIOM file to check.
    @param undesired_taxon_list: [list] List of strings searched in the affiliations taxonomies.
    @param in_all_or_in_consensus: [bool] If True, an observation is reported when all its affiliations contain an undesired taxon. Otherwise, it is reported when the removal of the undesired affiliations changes its consensus taxonomy.
    @param biom_out: [str] Path to the output BIOM with removed undesired taxonomy.
    @param impacted_file: [str] The path to the output file (one observation ID by line).
    """
    biom = BiomIO.from_json(input_biom)

    # The context manager guarantees that the list is flushed and closed
    with open(impacted_file, "w") as FH_impacted_file:
        for observation in biom.get_observations():

            # Keep only the affiliations without undesired taxon
            new_blast_affi = list()
            for affiliation in observation['metadata']['blast_affiliations']:
                if not any(t in ";".join(affiliation["taxonomy"])
                           for t in undesired_taxon_list):
                    new_blast_affi.append(affiliation)

            # If some affiliations are masked, update blast_affiliations and blast_taxonomy
            if len(new_blast_affi) != len(
                    observation['metadata']['blast_affiliations']):
                observation['metadata']['blast_affiliations'] = new_blast_affi
                new_consensus = get_tax_consensus(
                    [affi['taxonomy'] for affi in new_blast_affi])
                # Delete mode: all the affiliations belong to one of the undesired taxon
                if in_all_or_in_consensus and len(new_blast_affi) == 0:
                    FH_impacted_file.write(str(observation["id"]) + "\n")
                # Masking mode: the consensus is changed by ignoring the undesired taxon
                elif not in_all_or_in_consensus and new_consensus != observation[
                        'metadata']['blast_taxonomy']:
                    FH_impacted_file.write(str(observation["id"]) + "\n")
                observation['metadata']['blast_taxonomy'] = new_consensus

    BiomIO.write(biom_out, biom)
Пример #44
0
def getCheckedAbunByRank(real_tax, input_biom, sample, taxonomy_key,
                         multi_affiliation, logfile):
    """
    @summary: Returns the taxonomies and the abundances by rank of the observations present in the sample.
    @param real_tax: [dict] Taxonomy by reference IDs.
    @param input_biom: [str] Path to BIOM file.
    @param sample: [str] sample name.
    @param taxonomy_key: [str] The metadata key for taxonomy.
    @param multi_affiliation: [bool] If True, the taxonomy of an observation is selected among its blast_affiliations with selectOneMultiaffiliation(). Otherwise, the taxonomy stored in taxonomy_key is used.
    @param logfile: Transmitted as is to selectOneMultiaffiliation().
    @return: [tuple] In this order:
               - [int] The number of observations with a count > 0 in the sample.
               - [list] All the distinct possible taxonomies (joined with ";").
               - [list] The distinct retained taxonomies (joined with ";").
               - [list] By depth, the dictionary of the counts by taxon.
    """
    abund_by_rank = list()
    tax_list = list()
    full_tax_list = list()
    nb_seq = 0
    biom = BiomIO.from_json(input_biom)
    for observation in biom.get_observations():
        count = biom.get_count(observation["id"], sample)
        if count > 0:
            nb_seq += 1
            # Get taxonomy
            if not multi_affiliation:  # Standard affiliation
                taxonomy_clean = getCleanedTaxonomy(
                    observation["metadata"][taxonomy_key])
                if ";".join(taxonomy_clean) not in full_tax_list:
                    full_tax_list.append(";".join(taxonomy_clean))
            else:  # Multi-affiliation
                possible_taxonomies = [
                    getCleanedTaxonomy(affi["taxonomy"])
                    for affi in observation["metadata"]["blast_affiliations"]
                ]
                for taxonomy_clean in possible_taxonomies:
                    if ";".join(taxonomy_clean) not in full_tax_list:
                        full_tax_list.append(";".join(taxonomy_clean))
                taxonomy_clean = selectOneMultiaffiliation(
                    real_tax, observation["id"], possible_taxonomies, logfile)

            if ";".join(taxonomy_clean) not in tax_list:
                tax_list.append(";".join(taxonomy_clean))

            # Store count
            for depth in range(len(taxonomy_clean)):
                if len(abund_by_rank) < depth + 1:
                    abund_by_rank.append(dict())
                taxon = ";".join(
                    taxonomy_clean[:depth + 1]
                )  # prevent bug with same sp name but with different ancestors
                # "in" replaces dict.has_key() which does not exist on Python 3
                if taxon not in abund_by_rank[depth]:
                    abund_by_rank[depth][taxon] = 0
                abund_by_rank[depth][taxon] += count
    return nb_seq, full_tax_list, tax_list, abund_by_rank
Пример #45
0
def get_results(biom_file):
    """
    @summary: Returns the results of the affiliation.
    @param biom_file: [str] Path to a BIOM file after affiliation.
    @return: [tuple] The global results and the results by sample.
             In the global results "nb_clstr_ambiguous" and "nb_seq_ambiguous" are lists indexed by taxonomic depth:
             they count the clusters/sequences with the taxon "Multi-affiliation" at this depth.
    """
    global_results = {
        "nb_clstr": 0,
        "nb_seq": 0,
        "nb_clstr_with_affi": 0,
        "nb_seq_with_affi": 0,
        "nb_clstr_ambiguous": list(),
        "nb_seq_ambiguous": list(),
    }
    samples_results = dict()

    biom = BiomIO.from_json(biom_file)
    for cluster in biom.get_observations():
        nb_seq = biom.get_observation_count(cluster["id"])
        is_affiliated = cluster["metadata"]["blast_taxonomy"] is not None
        # Global results
        global_results["nb_clstr"] += 1
        global_results["nb_seq"] += nb_seq
        if is_affiliated:
            global_results["nb_clstr_with_affi"] += 1
            global_results["nb_seq_with_affi"] += nb_seq
            for depth, taxon in enumerate(
                    cluster["metadata"]["blast_taxonomy"]):
                # The ambiguity lists grow with the deepest taxonomy seen
                if len(global_results["nb_clstr_ambiguous"]) < (depth + 1):
                    global_results["nb_clstr_ambiguous"].append(0)
                    global_results["nb_seq_ambiguous"].append(0)
                if taxon == "Multi-affiliation":
                    global_results["nb_clstr_ambiguous"][depth] += 1
                    global_results["nb_seq_ambiguous"][depth] += nb_seq
        # Samples results
        for sample in biom.get_samples_by_observation(cluster["id"]):
            sample_name = sample["id"]
            # "in" replaces dict.has_key() which does not exist on Python 3
            if sample_name not in samples_results:
                samples_results[sample_name] = {
                    "nb_clstr": 0,
                    "nb_seq": 0,
                    "nb_clstr_with_affi": 0,
                    "nb_seq_with_affi": 0
                }
            count = biom.get_count(cluster["id"], sample_name)
            if count > 0:
                samples_results[sample_name]["nb_clstr"] += 1
                samples_results[sample_name]["nb_seq"] += count
                if is_affiliated:
                    samples_results[sample_name]["nb_clstr_with_affi"] += 1
                    samples_results[sample_name]["nb_seq_with_affi"] += count
    return global_results, samples_results
Пример #46
0
def excluded_obs_on_samplePresence(input_biom, min_sample_presence, excluded_file):
    """
    @summary: Writes the list of the observations present in an insufficient number of samples.
    @param input_biom: [str] The path to the BIOM file to check.
    @param min_sample_presence: [int] The observations present in a number of samples inferior than this value are reported in the excluded file.
    @param excluded_file: [str] The path to the output file (one observation name by line).
    """
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the file even if an error occurs during the scan
    with open( excluded_file, "w" ) as FH_excluded_file:
        for observation_name in biom.get_observations_names():
            # Count the elements without building a list
            nb_samples = sum(1 for x in biom.get_samples_by_observation(observation_name))
            if nb_samples < min_sample_presence:
                FH_excluded_file.write( observation_name + "\n" )
Пример #47
0
def get_results( biom_file ):
    """
    @summary: Returns the results of the affiliation.
    @param biom_file: [str] Path to a BIOM file after affiliation.
    @return: [tuple] The global results and the results by sample.
             In the global results "nb_clstr_ambiguous" and "nb_seq_ambiguous" are lists indexed by taxonomic depth:
             they count the clusters/sequences with the taxon "Multi-affiliation" at this depth.
    """
    global_results = {
        "nb_clstr": 0,
        "nb_seq": 0,
        "nb_clstr_with_affi": 0,
        "nb_seq_with_affi": 0,
        "nb_clstr_ambiguous": list(),
        "nb_seq_ambiguous": list(),
    }
    samples_results = dict()

    biom = BiomIO.from_json( biom_file )
    for cluster in biom.get_observations():
        nb_seq = biom.get_observation_count( cluster["id"] )
        blast_taxonomy = cluster["metadata"]["blast_taxonomy"]
        # Global results
        global_results["nb_clstr"] += 1
        global_results["nb_seq"] += nb_seq
        if blast_taxonomy is not None:
            global_results["nb_clstr_with_affi"] += 1
            global_results["nb_seq_with_affi"] += nb_seq
            for depth, taxon in enumerate(blast_taxonomy):
                # The ambiguity lists grow with the deepest taxonomy seen
                if len(global_results["nb_clstr_ambiguous"]) < (depth + 1):
                    global_results["nb_clstr_ambiguous"].append( 0 )
                    global_results["nb_seq_ambiguous"].append( 0 )
                if taxon == "Multi-affiliation":
                    global_results["nb_clstr_ambiguous"][depth] += 1
                    global_results["nb_seq_ambiguous"][depth] += nb_seq
        # Samples results
        for sample in biom.get_samples_by_observation( cluster["id"] ):
            sample_name = sample["id"]
            # "in" replaces dict.has_key() which does not exist on Python 3
            if sample_name not in samples_results:
                samples_results[sample_name] = {
                    "nb_clstr": 0,
                    "nb_seq": 0,
                    "nb_clstr_with_affi": 0,
                    "nb_seq_with_affi": 0
                }
            count = biom.get_count(cluster["id"], sample_name)
            if count > 0:
                samples_results[sample_name]["nb_clstr"] += 1
                samples_results[sample_name]["nb_seq"] += count
                if blast_taxonomy is not None:
                    samples_results[sample_name]["nb_clstr_with_affi"] += 1
                    samples_results[sample_name]["nb_seq_with_affi"] += count
    return global_results, samples_results
def get_alignment_distrib( input_biom, identity_tag, coverage_tag, multiple_tag ):
    """
    @summary: Returns the count (seq and clstr) for the different couples identity/coverage.
    @param input_biom: The path to the processed BIOM.
    @param identity_tag: The metadata tag used in BIOM file to store the alignment identity.
    @param coverage_tag: The metadata tag used in BIOM file to store the alignment query coverage.
    @param multiple_tag: The metadata tag used in BIOM file to store the list of possible taxonomies. If it is None the identity and the coverage are read directly in the observation metadata.
    @returns: [list] The count for the different identity/coverage. Each element is [identity, coverage, counts].
              The observations without identity are not counted.
              Example:
                [
                    [100, 100, { "clstr": 53, "seq": 20500 }],
                    [99, 100, { "clstr": 35, "seq": 18000 }],
                    [90, 95, { "clstr": 1, "seq": 10 }],
                ]
    """
    biom = BiomIO.from_json( input_biom )
    aln_results = list()
    aln_results_hash = dict()
    for observation in biom.get_observations():
        observation_metadata = observation['metadata']
        identity = None
        coverage = None
        # The function parameter is used, not the module-level args
        if multiple_tag is not None:
            affiliations = observation_metadata[multiple_tag] if multiple_tag in observation_metadata else None
            if affiliations is not None and len(affiliations) > 0:
                # Only the first affiliation is used
                identity = affiliations[0][identity_tag]
                coverage = affiliations[0][coverage_tag]
        else:
            if identity_tag in observation_metadata and coverage_tag in observation_metadata:
                identity = observation_metadata[identity_tag]
                coverage = observation_metadata[coverage_tag]
        if identity is not None:
            if identity not in aln_results_hash:
                aln_results_hash[identity] = dict()
            if coverage not in aln_results_hash[identity]:
                aln_results_hash[identity][coverage] = {
                    "clstr": 0,
                    "seq": 0
                }
            aln_results_hash[identity][coverage]["clstr"] += 1
            aln_results_hash[identity][coverage]["seq"] += biom.get_observation_count( observation['id'] )
    # Flatten the two levels dictionary in a list of [identity, coverage, counts]
    for ident in aln_results_hash:
        for cover in aln_results_hash[ident]:
            aln_results.append([
                ident,
                cover,
                aln_results_hash[ident][cover]
            ])
    del biom
    return aln_results
Пример #49
0
def process( args ):
    """
    @summary: Runs the rarefaction and the taxonomy tree commands on the BIOM then writes the summary.
              When args.multiple_tag is set without args.tax_consensus_tag, a temporary BIOM is first written
              with the taxonomy of the first affiliation stored under a dedicated metadata tag.
    @param args: The namespace of the script parameters. The attributes read here are output_file, input_biom,
                 taxonomy_tag, multiple_tag, tax_consensus_tag, taxonomic_ranks, rarefaction_ranks, log_file and debug.
    """
    working_dir = os.path.dirname(args.output_file)
    tmp_files = TmpFiles( working_dir )

    try:
        # Select the BIOM and the taxonomy tag used by the sub-commands
        analysed_biom = args.input_biom
        analysed_tag = args.taxonomy_tag
        if args.multiple_tag is not None:
            analysed_tag = args.tax_consensus_tag
            if analysed_tag is None:
                # No consensus: the taxonomy of the first affiliation is used
                analysed_tag = "Used_taxonomy_FROGS-affi"
                analysed_biom = tmp_files.add( "tax.biom" )
                loaded_biom = BiomIO.from_json( args.input_biom )
                for obs in loaded_biom.get_observations():
                    obs_metadata = obs["metadata"]
                    affiliations = obs_metadata[args.multiple_tag]
                    if len(affiliations) > 0:
                        obs_metadata[analysed_tag] = affiliations[0][args.taxonomy_tag]
                BiomIO.write( analysed_biom, loaded_biom )
                del loaded_biom

        # Rarefaction
        rarefaction_depths = list()
        for rank_name in args.rarefaction_ranks:
            rarefaction_depths.append( args.taxonomic_ranks.index(rank_name) )
        rarefaction_step = Rarefaction(analysed_biom, tmp_files, analysed_tag, rarefaction_depths)
        rarefaction_step.submit( args.log_file )
        rarefaction_outputs = rarefaction_step.output_files

        # Taxonomy tree
        count_tree_path = tmp_files.add( "taxCount.enewick" )
        ids_tree_path = tmp_files.add( "taxCount_ids.tsv" )
        tree_step = TaxonomyTree(analysed_biom, analysed_tag, count_tree_path, ids_tree_path)
        tree_step.submit( args.log_file )

        # Writes summary
        write_summary( args.output_file, args.input_biom, count_tree_path, ids_tree_path, rarefaction_outputs, args )
    finally:
        # The temporary files are kept only in debug mode
        if not args.debug:
            tmp_files.deleteAll()
0
def excluded_obs_on_nBiggest(input_biom, nb_selected, excluded_file):
    """
    @summary: Writes the list of all the observations without the n most abundant.
    @param input_biom: [str] The path to the BIOM file.
    @param nb_selected: [int] The number of the most abundant observations that will not be written in the excluded list.
    @param excluded_file: [str] The path to the output file (one observation name by line).
    """
    biom = BiomIO.from_json(input_biom)
    # Most abundant first: each element is used as (observation_name, observation_count)
    sorted_obs_counts = sorted(biom.get_observations_counts(),
                               key=lambda observation: observation[1],
                               reverse=True)
    # The context manager closes the file even if a write fails
    with open(excluded_file, "w") as FH_excluded_file:
        for observation_name, observation_count in sorted_obs_counts[nb_selected:]:
            FH_excluded_file.write(observation_name + "\n")
Пример #51
0
def get_alignment_distrib( input_biom, identity_tag, coverage_tag, multiple_tag ):
    """
    @summary: Returns the count (seq and clstr) for the different couples identity/coverage.
    @param input_biom: The path to the processed BIOM.
    @param identity_tag: The metadata tag used in BIOM file to store the alignment identity.
    @param coverage_tag: The metadata tag used in BIOM file to store the alignment query coverage.
    @param multiple_tag: The metadata tag used in BIOM file to store the list of possible taxonomies. If it is None the identity and the coverage are read directly in the observation metadata.
    @returns: [list] The count for the different identity/coverage. Each element is [identity, coverage, counts].
              The observations without alignment information are counted with identity 0 and coverage 0.
              Example:
                [
                    [100, 100, { "clstr": 53, "seq": 20500 }],
                    [99, 100, { "clstr": 35, "seq": 18000 }],
                    [90, 95, { "clstr": 1, "seq": 10 }],
                ]
    """
    biom = BiomIO.from_json( input_biom )
    aln_results = list()
    aln_results_hash = dict()
    for observation in biom.get_observations():
        observation_metadata = observation['metadata']
        # Default category for the observations without alignment information
        identity = 0
        coverage = 0
        # The function parameter is used, not the module-level args
        if multiple_tag is not None:
            affiliations = observation_metadata[multiple_tag] if multiple_tag in observation_metadata else None
            if affiliations is not None and len(affiliations) > 0:
                # Only the first affiliation is used
                identity = affiliations[0][identity_tag]
                coverage = affiliations[0][coverage_tag]
        else:
            if identity_tag in observation_metadata and coverage_tag in observation_metadata:
                identity = observation_metadata[identity_tag]
                coverage = observation_metadata[coverage_tag]
        if identity not in aln_results_hash:
            aln_results_hash[identity] = dict()
        if coverage not in aln_results_hash[identity]:
            aln_results_hash[identity][coverage] = {
                "clstr": 0,
                "seq": 0
            }
        aln_results_hash[identity][coverage]["clstr"] += 1
        aln_results_hash[identity][coverage]["seq"] += biom.get_observation_count( observation['id'] )
    # Flatten the two levels dictionary in a list of [identity, coverage, counts]
    for ident in aln_results_hash:
        for cover in aln_results_hash[ident]:
            aln_results.append([
                ident,
                cover,
                aln_results_hash[ident][cover]
            ])
    del biom
    return aln_results
Пример #52
0
def get_bootstrap_distrib( input_biom, bootstrap_tag, multiple_tag ):
    """
    @summary: Returns by taxonomic rank the count (seq and clstr) for the different bootstrap categories.
    @param input_biom: The path to the processed BIOM.
    @param bootstrap_tag: The metadata tag used in BIOM file to store the taxonomy bootstraps.
    @param multiple_tag: The metadata tag used in BIOM file to store the list of possible taxonomies. If it is None the bootstrap is read directly in the observation metadata.
    @returns: [dict] By taxonomic rank the count for the different bootstrap categories.
              Example:
                {
                    "Phylum": {
                        "80": { "clstr": 1, "seq":100 },
                        "90": {    "clstr": 2,    "seq":400 },
                        "100": { "clstr": 50, "seq":20000 },
                    },
                    "Genus":{
                        "80":{ "clstr": 1, "seq":100 },
                        "90":{ "clstr": 2, "seq":400 },
                        "100":{ "clstr": 50, "seq":20000 },
                    }
                }
    @note: The names of the ranks come from the module-level args.taxonomic_ranks.
    """
    bootstrap_results = dict()

    biom = BiomIO.from_json( input_biom )
    for observation in biom.get_observations():
        observation_metadata = observation['metadata']
        bootstrap = None
        if multiple_tag is not None:
            # "in" replaces dict.has_key() which does not exist on Python 3
            affiliations = observation_metadata[multiple_tag] if multiple_tag in observation_metadata else None
            # A null list of affiliations is managed like an empty list
            if affiliations is not None and len(affiliations) > 0:
                bootstrap = affiliations[0][bootstrap_tag]
        else:
            if bootstrap_tag in observation_metadata:
                bootstrap = observation_metadata[bootstrap_tag]
        if bootstrap is not None:
            for taxonomy_depth, rank_bootstrap in enumerate( bootstrap ):
                rank_bootstrap = rank_bootstrap * 100
                rank = args.taxonomic_ranks[taxonomy_depth]
                if rank not in bootstrap_results:
                    bootstrap_results[rank] = dict()
                if rank_bootstrap not in bootstrap_results[rank]:
                    bootstrap_results[rank][rank_bootstrap] = {
                        "clstr": 0,
                        "seq": 0
                    }
                bootstrap_results[rank][rank_bootstrap]["clstr"] += 1
                bootstrap_results[rank][rank_bootstrap]["seq"] += biom.get_observation_count( observation['id'] )
    del biom
    return bootstrap_results
Пример #53
0
def excluded_obs_on_samplePresence(input_biom, min_sample_presence,
                                   excluded_file):
    """
    @summary: Writes the list of the observations present in an insufficient number of samples.
    @param input_biom: [str] The path to the BIOM file to check.
    @param min_sample_presence: [int] The observations present in a number of samples inferior than this value are reported in the excluded file.
    @param excluded_file: [str] The path to the output file (one observation name by line).
    """
    biom = BiomIO.from_json(input_biom)
    # The context manager closes the file even if an error occurs during the scan
    with open(excluded_file, "w") as FH_excluded_file:
        for observation_name in biom.get_observations_names():
            # Count the elements without building a list
            nb_samples = sum(
                1 for x in biom.get_samples_by_observation(observation_name))
            if nb_samples < min_sample_presence:
                FH_excluded_file.write(observation_name + "\n")
Пример #54
0
def get_tree_with_count( input_biom, compress=False, taxonomy_key="taxonomy" ):
    """
    @summary: Builds the tree of taxa from a BIOM file by updating it sample after sample.
    @param input_biom: [str] Path to the BIOM file processed.
    @param compress: [bool] If true the samples are identified in the tree by their index instead of their name.
    @param taxonomy_key: [str] The metadata title for the taxonomy in biom.
    @return: [list] The tree generated and the ordered list of samples names (the position of a name is its index in compress mode).
    """
    taxa_tree = Node("root")
    samples_names = []
    biom = BiomIO.from_json( input_biom )
    for sample_idx, current_name in enumerate(biom.get_samples_names()):
        samples_names.append( current_name )
        # In compress mode the sample is designated by its position in samples_names
        tree_sample_id = sample_idx if compress else None
        update_tree_for_sample( biom, taxa_tree, current_name, taxonomy_key, tree_sample_id )
    return taxa_tree, samples_names
Пример #55
0
def get_tree_with_count( input_biom, compress=False, taxonomy_key="taxonomy" ):
    """
    @summary: Returns the taxa tree filled sample by sample from the BIOM.
    @param input_biom: [str] Path to the BIOM file processed.
    @param compress: [bool] If true the index of the sample is given to the tree in place of its name.
    @param taxonomy_key: [str] The metadata title for the taxonomy in biom.
    @return: [list] The tree generated and the ordered list of samples names (useful to retrieve a name by index if you use compress).
    """
    names_in_order = list()
    root_node = Node("root")
    biom = BiomIO.from_json( input_biom )
    for name in biom.get_samples_names():
        names_in_order.append( name )
        if compress:
            # Index of the sample that has just been added
            id_in_tree = len(names_in_order) - 1
        else:
            id_in_tree = None
        update_tree_for_sample( biom, root_node, name, taxonomy_key, id_in_tree )
    return root_node, names_in_order
Пример #56
0
def getCheckedAbunByRank( real_tax, input_biom, sample, taxonomy_key, multi_affiliation, duplication_groups ):
    """
    @summary: Returns the abundances by rank of the taxa of the sample after comparison of the affiliations with the real taxonomies.
    @param real_tax: [dict] Taxonomy by reference IDs.
    @param input_biom: [str] Path to BIOM file.
    @param sample: [str] sample name.
    @param taxonomy_key: [str] The metadata key for taxonomy.
    @param multi_affiliation: [bool] If True, the blast_affiliations of the observation are used to manage the ambiguities. Otherwise, only the taxonomy stored in taxonomy_key is compared to the real taxonomy.
    @param duplication_groups: [dict] By reference ID the list of IDs for references with the same sequence.
    @return: [list] By depth, the dictionary of the counts by taxon.
    @note: The observations with a "," in their metadata "grinder_source" are processed as chimeras.
    """
    abund_by_rank = list()
    biom = BiomIO.from_json( input_biom )
    for observation in biom.get_observations():
        count = biom.get_count( observation["id"], sample )
        if count > 0:
            # Get taxonomy
            ref_id = observation["metadata"]["grinder_source"]
            is_chimera = "," in ref_id
            taxonomy_clean = getCleanedTaxonomy(observation["metadata"][taxonomy_key])
            if not multi_affiliation: # Standard affiliation
                if not is_chimera:
                    if taxIsRetrieved(real_tax[ref_id], [taxonomy_clean]):
                        taxonomy_clean = real_tax[ref_id][0]
            else: # Multi-affiliation
                if not is_chimera:
                    subjects_ids = [affi["subject"] for affi in observation["metadata"]["blast_affiliations"]]
                    possible_taxonomies = [";".join(getCleanedTaxonomy(affi["taxonomy"])) for affi in observation["metadata"]["blast_affiliations"]]
                    # Manage ambiguity
                    if refIDIsRetrieved(ref_id, subjects_ids, duplication_groups):
                        taxonomy_clean = real_tax[ref_id][0]
                    elif len(subjects_ids) > 499 and taxIsRetrieved(real_tax[ref_id], possible_taxonomies):
                        taxonomy_clean = real_tax[ref_id][0]
                    elif "Multi-affiliation" in taxonomy_clean:
                        taxonomy_clean = getCleanedTaxonomy(observation["metadata"]["blast_affiliations"][0]["taxonomy"]) # Select one
                else: # Chimera
                    if "Multi-affiliation" in taxonomy_clean:
                        taxonomy_clean = getCleanedTaxonomy(observation["metadata"]["blast_affiliations"][0]["taxonomy"]) # Select one
            # Store count
            for depth in range(len(taxonomy_clean)):
                if len(abund_by_rank) < depth+1:
                    abund_by_rank.append(dict())
                taxon = ";".join( taxonomy_clean[:depth+1] ) # prevent bug with same sp name but with different ancestors
                # "in" replaces dict.has_key() which does not exist on Python 3
                if taxon not in abund_by_rank[depth]:
                    abund_by_rank[depth][taxon] = 0
                abund_by_rank[depth][taxon] += count
    return abund_by_rank
Пример #57
0
def excluded_obs_on_rdpBootstrap(input_biom, taxonomic_depth, min_bootstrap, excluded_file):
    """
    @summary: Writes the list of the observations with an insufficient bootstrap on the specified taxonomic rank.
    @param input_biom: [str] The path to the BIOM file to check.
    @param taxonomic_depth: [int] The taxonomic rank depth to check (example: 6 for Species in system "Domain, Phylum, Class, Order, Family, Genus, Species").
    @param min_bootstrap: [float] The observations with a value inferior to this threshold at the specified taxonomic depth are reported in the excluded file.
    @param excluded_file: [str] The path to the output file (one observation ID per line).
    @note: The "rdp_bootstrap" metadata is accepted either as a sequence (one value by rank) or as a single string where the values are separated by ";".
    """
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the output even if an observation raises.
    with open( excluded_file, "w" ) as FH_excluded_file:
        for observation in biom.get_observations():
            bootstrap = observation["metadata"]["rdp_bootstrap"]
            # Duck-typing on split() covers every string-like value (str, and
            # unicode under Python 2) while leaving lists/tuples untouched.
            if hasattr(bootstrap, "split"):
                bootstrap = bootstrap.split(";")
            # Items produced by split() are text: convert before comparing,
            # otherwise the comparison with the numeric threshold is
            # meaningless (Python 2) or raises TypeError (Python 3).
            if float(bootstrap[taxonomic_depth]) < min_bootstrap:
                FH_excluded_file.write( str(observation["id"]) + "\n" )
Пример #58
0
def excluded_obs_on_abundance(input_biom, min_abundance, excluded_file):
    """
    @summary: Writes the list of the observations with an insufficient abundance.
    @param input_biom: [str] The path to the BIOM file to check.
    @param min_abundance: [int/float] The observations with an abundance inferior than this value are reported in the excluded file. A float is interpreted as a proportion of the total count of the BIOM, any other value as a number of sequences.
    @param excluded_file: [str] The path to the output file (one observation ID per line).
    """
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the output even if the BIOM traversal raises.
    with open( excluded_file, "w" ) as FH_excluded_file:
        min_nb_seq = min_abundance
        # A float threshold is a proportion: convert it to a number of sequences.
        # isinstance() also accepts float subclasses, unlike a strict type equality.
        if isinstance(min_abundance, float):
            min_nb_seq = biom.get_total_count() * min_abundance
        for idx, count_by_sample in enumerate(biom.to_count()):
            # The abundance of an observation is its count summed over all samples.
            if sum(count_by_sample) < min_nb_seq:
                FH_excluded_file.write( str(biom.rows[idx]["id"]) + "\n" )
Пример #59
0
def biom_to_tsv( input_biom, output_tsv, fields, list_separator ):
    """
    @summary: Convert BIOM file to TSV file.
    @param input_biom: [str] Path to the BIOM file.
    @param output_tsv: [str] Path to the output file (format : TSV).
    @param fields: [list] Columns and their order in output. Special columns : '@observation_name', '@observation_sum', '@sample_count' '@rdp_tax_and_bootstrap' . The others columns must be metadata title.
    @param list_separator: [str] Separator for complex metadata.
    """
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the output even if building a line raises.
    with open( output_tsv, "w" ) as out_fh:
        # Header (prefixed by "#")
        header_parts = header_line_parts( fields, biom )
        out_fh.write( "#" + "\t".join(header_parts) + "\n" )
        # Data: one line per observation, in the order of the count matrix
        for obs_idx, count_by_sample in enumerate(biom.to_count()):
            observation_parts = observation_line_parts( biom.rows[obs_idx], count_by_sample, fields, list_separator )
            out_fh.write( "\t".join(observation_parts) + "\n" )
Пример #60
0
def biom_fasta_update(biom_in, fasta_in, fasta_out, log_file):
    """
    @summary: Copies in the output FASTA the records of the input FASTA for which biom.find_idx("observation", record.id) does not raise ValueError, then writes the number of records read and written in the log file.
    @param biom_in: Value given to BiomIO.from_json (presumably the path to the BIOM file - to confirm).
    @param fasta_in: Value given to FastaIO for reading (presumably the path to the FASTA file to filter - to confirm).
    @param fasta_out: Value given to FastaIO in "w" mode (presumably the path to the filtered FASTA file - to confirm).
    @param log_file: [str] The path to the log file where the two counters are written.
    """
    FH_in = FastaIO( fasta_in )
    FH_out = FastaIO( fasta_out, "w" )
    biom = BiomIO.from_json( biom_in )
    seq_in=0 # Number of records read from fasta_in
    seq_out=0 # Number of records written in fasta_out

    for record in FH_in:
        seq_in += 1
        try:
            # Only used as an existence test: the returned index is ignored.
            biom.find_idx("observation",record.id)
        except ValueError:
            # ValueError is treated as "ID not in the BIOM": the record is dropped.
            pass
        else:
            FH_out.write(record)
            seq_out += 1
    FH_in.close()
    FH_out.close()
    # Log the counters
    FH_log=open(log_file,"w")
    FH_log.write("Number of sequence in :" + str(seq_in)+"\n" )
    FH_log.write("Number of sequence out :" + str(seq_out) +"\n")