def sampling_by_sample( input_biom, output_biom, nb_sampled=None, sampled_ratio=None ):
    """
    @summary: Writes a BIOM after a random sampling in each sample.
    @param input_biom: [str] Path to the processed BIOM.
    @param output_biom: [str] Path to outputed BIOM.
    @param nb_sampled: [int] Number of sampled sequences by sample.
    @param sampled_ratio: [float] Ratio of sampled sequences by sample.
    @note: nb_sampled and sampled_ratio are mutually exclusive. When both are
           provided nb_sampled is used. At least one of them is required.
    """
    if nb_sampled is None and sampled_ratio is None:
        raise ValueError( "One of nb_sampled or sampled_ratio must be provided." )

    initial_biom = BiomIO.from_json( input_biom )
    if nb_sampled is not None:
        sampling_label = str(nb_sampled)
    else:
        sampling_label = str(sampled_ratio) + "%"
    # NOTE(review): matrix_type="sparse" mirrors the other BIOM builders of
    # this file; confirm it matches the storage expected for sampled tables.
    new_biom = Biom(
        matrix_type="sparse",
        generated_by="Sampling " + sampling_label + " elements by sample from " + input_biom
    )

    # Observations are shared by all samples: each one is declared only once.
    observations_already_added = set()
    for sample_name in initial_biom.get_samples_names():
        new_biom.add_sample( sample_name, initial_biom.get_sample_metadata(sample_name) )
        sample_seq = initial_biom.get_sample_count(sample_name)
        sample_nb_sampled = nb_sampled
        if nb_sampled is None:
            sample_nb_sampled = int(sample_seq * sampled_ratio)
        # Compare with the effective number of draws: nb_sampled is None when
        # the sampling is driven by sampled_ratio.
        if sample_seq < sample_nb_sampled:
            raise_exception( Exception( "\n\n#ERROR : " + str(sample_nb_sampled) + " sequences cannot be sampled in sample '" + str(sample_name) + "'. It only contains " + str(sample_seq) + " sequences.\n\n" ))
        else:
            for _ in range(sample_nb_sampled):
                # Take an observation in initial BIOM. Its count is decremented
                # so that the same sequence cannot be drawn twice.
                selected_observation = initial_biom.random_obs_by_sample(sample_name)
                selected_observation_id = selected_observation['id']
                initial_biom.subtract_count( selected_observation_id, sample_name, 1 )
                # Put in new BIOM
                if selected_observation_id not in observations_already_added:
                    new_biom.add_observation( selected_observation_id, initial_biom.get_observation_metadata(selected_observation_id) )
                    observations_already_added.add( selected_observation_id )
                new_biom.add_count( selected_observation_id, sample_name, 1 )
    BiomIO.write( output_biom, new_biom )
def to_biom(clusters_file, count_file, output_biom, size_separator):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    """
    biom = Biom(generated_by='swarm', matrix_type="sparse")

    # Preclusters count by sample
    preclusters_count = dict()
    with open(count_file) as count_fh:
        # The header line lists the sample names after the ID column.
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            line_fields = line.strip().split()
            if not line_fields:  # Skip blank lines
                continue
            count_by_sample = {}
            for idx, val in enumerate(line_fields[1:]):
                # Fields are read as text: convert before comparing.
                count = int(val)
                if count > 0:
                    count_by_sample[samples[idx]] = count
            preclusters_count[line_fields[0]] = count_by_sample

    # Add samples
    for sample_name in samples:
        biom.add_sample(sample_name)

    # Process count
    cluster_idx = 1
    with open(clusters_file) as clusters_fh:
        for line in clusters_fh:
            line_fields = line.strip().split()
            if not line_fields:  # Skip blank lines
                continue
            cluster_name = "Cluster_" + str(cluster_idx)
            cluster_count = {key: 0 for key in samples}
            # Retrieve count by sample
            for seq_id in line_fields:
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                precluster_count = preclusters_count[real_seq_id]
                for preclust_sample in precluster_count:
                    cluster_count[preclust_sample] += precluster_count[preclust_sample]
                # The precluster has been consumed: drop its counts.
                preclusters_count[real_seq_id] = None
            # Add cluster on biom: the first sequence of the line is the seed.
            biom.add_observation(
                cluster_name,
                {'seed_id': line_fields[0].rsplit(size_separator, 1)[0]})
            for sample_name in samples:
                if cluster_count[sample_name] > 0:
                    biom.add_count(cluster_name, sample_name,
                                   cluster_count[sample_name])
            # Next cluster
            cluster_idx += 1

    # Write
    BiomIO.write(output_biom, biom)
def tsv_to_biom( input_tsv, multi_hit_dict, fields, samples_names, output_biom, output_fasta ):
    """
    @summary: Convert TSV file to Biom file.
    @param input_tsv: [str] Path to the TSV file.
    @param multi_hit_dict: [dict] Dictionary describing equivalent multi blast hit :
    dict[observation_name]=[ {"blast_taxonomy":taxonomy, "blast_subject":subject, "blast_perc_identity": per_id, "blast_perc_query_coverage":per_cov, "blast_evalue":eval, "blast_aln_length":aln}]
    @param fields: [list] column name to include as metadata (must at least contain observation_name): observation_sum and seed_sequence will be excluded, rdp_tax_and_bootstrap will be split in two metadata
    @param samples_names: [list] list of sample names.
    @param output_biom: [str] Path to the output file (format : BIOM).
    @param output_fasta: [str] Path to the output file (format : fasta). None to skip the seeds extraction.
    """
    biom = Biom( matrix_type="sparse" )

    clusters_count = dict()
    clusters_metadata = dict()

    Fasta_fh = None
    if output_fasta is not None:
        Fasta_fh = FastaIO( output_fasta, "w" )

    try:
        with open( input_tsv ) as in_fh:
            # parse header and store column index
            header = in_fh.readline()
            if header.startswith("#"):
                header = header[1:]
            header = header.strip()
            seed_seq_idx, metadata_index, sample_index = header_line_dict(fields,header,samples_names)
            if output_fasta is not None and seed_seq_idx == -1:
                raise Exception("\nYou want to extract seed fasta sequence but there is no seed_sequence column in your TSV file\n\n")

            # count by sample, and metadata
            for line in in_fh:
                if not line.strip():  # skip blank lines
                    continue
                line_list = line.strip().split("\t")
                cluster_name = ""
                # reset so a short row cannot inherit the previous row's sequence
                seed_seq = None
                count_by_sample = {}
                metadata_dict = {}
                # parse columns
                for idx,val in enumerate(line_list):
                    # recover metadata
                    if idx in metadata_index:
                        if metadata_index[idx]=="observation_name" :
                            cluster_name = val
                        else:
                            metadata_dict[metadata_index[idx]] = val
                    # recover samples count (fields are text: convert before comparing)
                    elif idx in sample_index and int(val) > 0:
                        count_by_sample[sample_index[idx]] = int(val)
                    # recover seed sequence
                    elif idx == seed_seq_idx:
                        seed_seq = val

                # if fasta output file => store the seed sequence
                if Fasta_fh is not None:
                    seq = Sequence( cluster_name, seed_seq)
                    Fasta_fh.write( seq )

                if "taxonomy" in metadata_dict:
                    metadata_dict["taxonomy"] = metadata_dict["taxonomy"].split(";")

                # format rdp taxonomy to fit BIOM format
                # NOTE(review): assumes the field alternates "taxon;(bootstrap);"
                # and that the target keys are rdp_taxonomy / rdp_bootstrap - confirm.
                if "rdp_tax_and_bootstrap" in metadata_dict:
                    metadata_dict["rdp_taxonomy"] = []
                    metadata_dict["rdp_bootstrap"] = []
                    tax = metadata_dict["rdp_tax_and_bootstrap"].rstrip(";").split(";")
                    for i in range(0,len(tax),2):
                        metadata_dict["rdp_taxonomy"].append( tax[i] )
                        metadata_dict["rdp_bootstrap"].append( tax[i+1].replace("(","").replace(")","") )
                    metadata_dict.pop("rdp_tax_and_bootstrap")

                # format blast taxonomy to fit BIOM format (one consensus blast_taxonomy and possible multiples blast_affiliation detailed)
                if "blast_taxonomy" in metadata_dict:
                    metadata_dict["blast_taxonomy"] = metadata_dict["blast_taxonomy"].split(";")

                    # check multihit blast : filter non consistent taxonomy hit with blast_taxonomy (if TSV modified), and compute consensus tax (if multihit line suppressed)
                    if metadata_dict["blast_subject"] == "multi-subject" and multi_hit_dict is not None:
                        if cluster_name not in multi_hit_dict:
                            raise Exception("\n"+cluster_name+" has multi-subject tag but is not present in your multi-hit TSV file. Please, provide the original multi-hit TSV file.\n\n")
                        else:
                            metadata_dict["blast_taxonomy"], metadata_dict["blast_affiliations"] = observation_blast_parts(metadata_dict, multi_hit_dict[cluster_name])
                            if metadata_dict["blast_affiliations"] == []:
                                raise Exception("\nyour multihit TSV file is no more consistent with your abundance TSV file for (at least) "+cluster_name+"\n\n")
                    # no multi tag= blast affiliation is equal to blast_taxonomy
                    else:
                        blast_dict={key.replace("blast_",""):metadata_dict[key] for key in metadata_dict if key.startswith("blast")}
                        metadata_dict["blast_affiliations"] = [blast_dict]

                    # filter blast metadata which are moved to blast_affiliations
                    # NOTE(review): assumes affiliation keys are the metadata
                    # names without their "blast_" prefix - confirm.
                    for metadata in metadata_dict["blast_affiliations"][0]:
                        if not metadata == "taxonomy":
                            metadata_dict.pop( "blast_" + metadata )

                # add cluster and count to clusters_count dict
                clusters_count[cluster_name] = count_by_sample
                # add cluster and metadata to clusters_metadata dict
                clusters_metadata[cluster_name] = metadata_dict
    finally:
        # Release the fasta handle even when the parsing fails.
        if Fasta_fh is not None:
            Fasta_fh.close()

    #add samples to biom
    for sample_name in samples_names:
        biom.add_sample( sample_name )

    # add to cluster to biom
    for cluster_name in clusters_count:
        biom.add_observation( cluster_name, clusters_metadata[cluster_name] )
        for sample_name in samples_names:
            # Only strictly positive counts are stored: absent means 0.
            sample_count = clusters_count[cluster_name].get( sample_name, 0 )
            if sample_count > 0:
                biom.add_count( cluster_name, sample_name, sample_count )

    # Write
    BiomIO.write( output_biom, biom )