Пример #1
0
def sampling_by_sample( input_biom, output_biom, nb_sampled=None, sampled_ratio=None ):
    """
    @summary: Writes a BIOM after a random sampling in each sample.
    @param input_biom: [str] Path to the processed BIOM.
    @param output_biom: [str] Path to the output BIOM.
    @param nb_sampled: [int] Number of sequences to draw in each sample.
    @param sampled_ratio: [float] Ratio of sequences to draw in each sample.
                          The number drawn is int(sample count * sampled_ratio).
    @note: nb_sampled and sampled_ratio are mutually exclusive.
    @note: An error is reported through raise_exception() when a sample
           contains fewer sequences than the number to draw from it.
    """
    initial_biom = BiomIO.from_json( input_biom )
    sampling_label = str(nb_sampled) if nb_sampled is not None else str(sampled_ratio) + "%"
    new_biom = Biom(
                    matrix_type="sparse",
                    generated_by="Sampling " + sampling_label + " elements by sample from " + input_biom
    )
    # Observations are shared between samples: each must be declared only once.
    added_observations = set()
    for sample_name in initial_biom.get_samples_names():
        new_biom.add_sample( sample_name, initial_biom.get_sample_metadata(sample_name) )
        sample_seq = initial_biom.get_sample_count(sample_name)
        if nb_sampled is None:
            sample_nb_sampled = int(sample_seq * sampled_ratio)
        else:
            sample_nb_sampled = nb_sampled
        # Check against the effective draw size: nb_sampled is None when the
        # sampling is driven by sampled_ratio, so it cannot be compared here.
        if sample_seq < sample_nb_sampled:
            raise_exception( Exception( "\n\n#ERROR : " + str(sample_nb_sampled) + " sequences cannot be sampled in sample '" + str(sample_name) + "'. It only contains " + str(sample_seq) + " sequences.\n\n" ))
        else:
            for _ in range(sample_nb_sampled):
                # Take an observation in initial BIOM
                selected_observation_id = initial_biom.random_obs_by_sample(sample_name)['id']
                # The drawn unit is removed from the source count (presumably
                # so that the sampling is done without replacement).
                initial_biom.subtract_count( selected_observation_id, sample_name, 1 )
                # Put in new BIOM
                if selected_observation_id not in added_observations:
                    new_biom.add_observation( selected_observation_id, initial_biom.get_observation_metadata(selected_observation_id) )
                    added_observations.add( selected_observation_id )
                new_biom.add_count( selected_observation_id, sample_name, 1 )
    BiomIO.write( output_biom, new_biom )
Пример #2
0
def to_biom(clusters_file, count_file, output_biom, size_separator):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file. Each line lists
                            the members of one cluster, separated by
                            whitespace; the first member is used as the seed.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
                         The first line is the header holding the sample names.
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    """
    biom = Biom(generated_by='swarm', matrix_type="sparse")

    # Preclusters count by sample (only the non-null counts are stored)
    preclusters_count = dict()
    with open(count_file) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            line_fields = line.strip().split()
            count_by_sample = {}
            for idx, val in enumerate(line_fields[1:]):
                # The fields are text: convert before comparing with 0.
                count = int(val)
                if count > 0:
                    count_by_sample[samples[idx]] = count
            preclusters_count[line_fields[0]] = count_by_sample

    # Add samples
    for sample_name in samples:
        biom.add_sample(sample_name)

    # Process count
    with open(clusters_file) as clusters_fh:
        for cluster_idx, line in enumerate(clusters_fh, 1):
            cluster_name = "Cluster_" + str(cluster_idx)
            cluster_count = {key: 0 for key in samples}
            line_fields = line.strip().split()
            # Retrieve count by sample
            for seq_id in line_fields:
                # Drop the abundance suffix to get the id used in count file
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                precluster_count = preclusters_count[real_seq_id]
                for preclust_sample in precluster_count:
                    cluster_count[preclust_sample] += precluster_count[
                        preclust_sample]
                # The counts of this precluster are no longer needed
                preclusters_count[real_seq_id] = None
            # Add cluster on biom
            biom.add_observation(
                cluster_name,
                {'seed_id': line_fields[0].rsplit(size_separator, 1)[0]})
            for sample_name in samples:
                if cluster_count[sample_name] > 0:
                    biom.add_count(cluster_name, sample_name,
                                   cluster_count[sample_name])

    # Write
    BiomIO.write(output_biom, biom)
Пример #3
0
def sampling_by_sample( input_biom, output_biom, nb_sampled=None, sampled_ratio=None ):
    """
    @summary: Writes a BIOM after a random sampling in each sample.
    @param input_biom: [str] Path to the processed BIOM.
    @param output_biom: [str] Path to the output BIOM.
    @param nb_sampled: [int] Number of sequences to draw in each sample.
    @param sampled_ratio: [float] Ratio of sequences to draw in each sample.
                          The number drawn is int(sample count * sampled_ratio).
    @note: nb_sampled and sampled_ratio are mutually exclusive.
    @note: An Exception is raised when a sample contains fewer sequences than
           the number to draw from it.
    """
    initial_biom = BiomIO.from_json( input_biom )
    sampling_label = str(nb_sampled) if nb_sampled is not None else str(sampled_ratio) + "%"
    new_biom = Biom(
                    matrix_type="sparse",
                    generated_by="Sampling " + sampling_label + " elements by sample from " + input_biom
    )
    # Observations are shared between samples: each must be declared only once.
    added_observations = set()
    for sample_name in initial_biom.get_samples_names():
        new_biom.add_sample( sample_name, initial_biom.get_sample_metadata(sample_name) )
        sample_seq = initial_biom.get_sample_count(sample_name)
        if nb_sampled is None:
            sample_nb_sampled = int(sample_seq * sampled_ratio)
        else:
            sample_nb_sampled = nb_sampled
        # Check against the effective draw size: nb_sampled is None when the
        # sampling is driven by sampled_ratio, so it cannot be compared here.
        if sample_seq < sample_nb_sampled:
            raise Exception( str(sample_nb_sampled) + " sequences cannot be sampled in sample '" + str(sample_name) + "'. It only contains " + str(sample_seq) + " sequences." )
        for _ in range(sample_nb_sampled):
            # Take an observation in initial BIOM
            selected_observation_id = initial_biom.random_obs_by_sample(sample_name)['id']
            # The drawn unit is removed from the source count (presumably so
            # that the sampling is done without replacement).
            initial_biom.subtract_count( selected_observation_id, sample_name, 1 )
            # Put in new BIOM ("in" replaces dict.has_key(), removed in Python 3)
            if selected_observation_id not in added_observations:
                new_biom.add_observation( selected_observation_id, initial_biom.get_observation_metadata(selected_observation_id) )
                added_observations.add( selected_observation_id )
            new_biom.add_count( selected_observation_id, sample_name, 1 )
    BiomIO.write( output_biom, new_biom )
Пример #4
0
def to_biom( clusters_file, count_file, output_biom, size_separator ):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file. Each line lists
                            the members of one cluster, separated by
                            whitespace; the first member is used as the seed.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
                         The first line is the header holding the sample names.
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    @note : Clusters whose seed id contains "FROGS_combined" get the suffix
            "_FROGS_combined" in their name and the comment "WARNING";
            the comment of the other clusters is "na".
    """
    biom = Biom( generated_by='swarm', matrix_type="sparse" )

    # Preclusters count by sample
    preclusters_count = dict()
    with open( count_file ) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            precluster_id, count_str = line.strip().split(None, 1)
            # For large datasets, keeping the counts as one string uses less
            # RAM than a parsed sparse count.
            preclusters_count[precluster_id] = count_str

    # Add samples
    for sample_name in samples:
        biom.add_sample( sample_name )

    # Process count
    with open( clusters_file ) as clusters_fh:
        for cluster_idx, line in enumerate(clusters_fh, 1):
            line_fields = line.strip().split()
            seed_id = line_fields[0]
            cluster_name = "Cluster_" + str(cluster_idx)
            comment = "na"
            if "FROGS_combined" in seed_id:
                cluster_name += "_FROGS_combined"
                comment = "WARNING"
            # Retrieve count by sample
            cluster_count = {key:0 for key in samples}
            for seq_id in line_fields:
                # Drop the abundance suffix to get the id used in count file
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                sample_counts = preclusters_count[real_seq_id].split()
                for sample_idx, sample_name in enumerate(samples):
                    cluster_count[sample_name] += int(sample_counts[sample_idx])
                # The counts of this precluster are no longer needed
                preclusters_count[real_seq_id] = None
            # Add cluster on biom
            biom.add_observation( cluster_name, {'comment': comment, 'seed_id': seed_id.rsplit(size_separator, 1)[0]} )
            observation_idx = biom.find_idx("observation", cluster_name)
            # NOTE(review): the position in 'samples' is used as the matrix
            # column index; this assumes add_sample() keeps insertion order.
            for sample_idx, sample_name in enumerate(samples):
                if cluster_count[sample_name] > 0:
                    biom.data.change( observation_idx, sample_idx, cluster_count[sample_name] )

    # Write
    BiomIO.write( output_biom, biom )
Пример #5
0
def to_biom( clusters_file, count_file, output_biom, size_separator ):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file. Each line lists
                            the members of one cluster, separated by
                            whitespace; the first member is used as the seed.
    @param count_file : [str] path to the count file. It contains the count of
                         sequences by sample of each preclusters.
                         Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
                         The first line is the header holding the sample names.
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    """
    biom = Biom( generated_by='swarm', matrix_type="sparse" )

    # Preclusters count by sample
    preclusters_count = dict()
    with open( count_file ) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            precluster_id, count_str = line.strip().split(None, 1)
            # For large datasets, keeping the counts as one string uses less
            # RAM than a parsed sparse count.
            preclusters_count[precluster_id] = count_str

    # Add samples
    for sample_name in samples:
        biom.add_sample( sample_name )

    # Process count
    with open( clusters_file ) as clusters_fh:
        for cluster_idx, line in enumerate(clusters_fh, 1):
            cluster_name = "Cluster_" + str(cluster_idx)
            line_fields = line.strip().split()
            # Retrieve count by sample
            cluster_count = {key:0 for key in samples}
            for seq_id in line_fields:
                # Drop the abundance suffix to get the id used in count file
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                sample_counts = preclusters_count[real_seq_id].split()
                for sample_idx, sample_name in enumerate(samples):
                    cluster_count[sample_name] += int(sample_counts[sample_idx])
                # The counts of this precluster are no longer needed
                preclusters_count[real_seq_id] = None
            # Add cluster on biom
            biom.add_observation( cluster_name, {'seed_id':line_fields[0].rsplit(size_separator, 1)[0]} )
            observation_idx = biom.find_idx("observation", cluster_name)
            # NOTE(review): the position in 'samples' is used as the matrix
            # column index; this assumes add_sample() keeps insertion order.
            for sample_idx, sample_name in enumerate(samples):
                if cluster_count[sample_name] > 0:
                    biom.data.change( observation_idx, sample_idx, cluster_count[sample_name] )

    # Write
    BiomIO.write( output_biom, biom )
Пример #6
0
##################################################################################################################################################
if __name__ == "__main__":
    # Manage parameters
    parser = argparse.ArgumentParser( description='Write a BIOM from grinder count profile.' )
    parser.add_argument( '-v', '--version', action='version', version=__version__ )
    # Inputs
    group_input = parser.add_argument_group( 'Inputs' )
    group_input.add_argument( '-s', '--samples', type=str, action=SampleParameter, metavar=("SAMPLE_NAME:SAMPLE_PATH"), nargs='+', help="Samples names and grinder rank files." )
    group_input.add_argument( '-a', '--affiliation', required=True, help='Path to the databank source for simulated sequence (format: fasta). The description of sequences must be the taxonomy.' )
    # Outputs
    group_output = parser.add_argument_group( 'Outputs' )
    group_output.add_argument( '-o', '--output', required=True, help='The output BIOM (format: BIOM).' )
    args = parser.parse_args()

    taxonomy_key = "real_taxonomy"
    biom = Biom( generated_by="grinder", matrix_type="sparse" )

    # Set observations count
    # args.samples is used as a mapping: sample name -> path of its rank file.
    for sample_name in args.samples:
        biom.add_sample( sample_name )
        with open( args.samples[sample_name] ) as fh_abund:
            for line in fh_abund: # Content format: "# rank<TAB>seq_id<TAB>rel_abund_perc"
                if line.startswith('#'):
                    continue
                fields = line.strip().split()
                try:
                    biom.add_observation( fields[1] )
                except Exception: # already exist (declared by a previous sample)
                    # NOTE(review): the exception type raised by
                    # add_observation() is not visible here; narrow this
                    # clause once it is confirmed.
                    pass
                # The relative abundance is scaled to an integer count; the
                # scaling factor depends on the precision of grinder.
                biom.change_count( fields[1], sample_name, int(float(fields[2])*100000000000000) )
Пример #7
0
def tsv_to_biom( input_tsv, multi_hit_dict, fields, samples_names, output_biom, output_fasta ):
    """
    @summary: Convert TSV file to Biom file.
    @param input_tsv: [str] Path to the TSV file.
    @param multi_hit_dict: [dict] Dictionary describing equivalent multi blast hit (None if there is no multi-hit file) :
    dict[observation_name]=[ {"blast_taxonomy":taxonomy, "blast_subject":subject, "blast_perc_identity": per_id, "blast_perc_query_coverage":per_cov, "blast_evalue":eval, "blast_aln_length":aln}]
    @param fields: [list] column name to include as metadata (must at least contain observation_name): observation_sum and seed_sequence will be excluded, rdp_tax_and_bootstrap will be split in two metadata
    @param samples_names: [list] list of sample names.
    @param output_biom: [str] Path to the output file (format : BIOM).
    @param output_fasta: [str] Path to the output file (format : fasta). None to skip the extraction of the seed sequences.
    @note: An Exception is raised when the seeds are requested but the TSV has
           no seed_sequence column, and when a "multi-subject" observation is
           missing from (or no longer consistent with) multi_hit_dict.
    """
    biom = Biom( matrix_type="sparse" )

    clusters_count = dict()
    clusters_metadata = dict()
    Fasta_fh = None
    in_fh = open( input_tsv )
    try:
        if output_fasta is not None:
            Fasta_fh = FastaIO( output_fasta, "w" )

        # parse header and store column index
        header = in_fh.readline()
        if header.startswith("#"):
            header = header[1:]
        header = header.strip()
        seed_seq_idx, metadata_index, sample_index = header_line_dict(fields, header, samples_names)
        if output_fasta is not None and seed_seq_idx == -1:
            raise Exception("\nYou want to extract seed fasta sequence but there is no seed_sequence column in your TSV file\n\n")

        # count by sample, and metadata
        for line in in_fh:
            cluster_name = ""
            count_by_sample = {}
            metadata_dict = {}
            # parse columns
            for idx, val in enumerate(line.strip().split("\t")):
                # recover metadata
                if idx in metadata_index:
                    if metadata_index[idx] == "observation_name":
                        cluster_name = val
                    else:
                        metadata_dict[metadata_index[idx]] = val
                # recover samples count (only the non-null counts are stored)
                elif idx in sample_index:
                    # The field is text: convert before comparing with 0.
                    count = int(val)
                    if count > 0:
                        count_by_sample[sample_index[idx]] = count
                # recover seed sequence
                elif idx == seed_seq_idx:
                    seed_seq = val

            # if fasta output file => store the seed sequence
            if Fasta_fh is not None:
                Fasta_fh.write( Sequence(cluster_name, seed_seq) )

            if "taxonomy" in metadata_dict:
                metadata_dict["taxonomy"] = metadata_dict["taxonomy"].split(";")

            # format rdp taxonomy to fit BIOM format
            if "rdp_tax_and_bootstrap" in metadata_dict:
                # The field alternates taxon and "(bootstrap)" items.
                tax = metadata_dict.pop("rdp_tax_and_bootstrap").rstrip(";").split(";")
                metadata_dict["rdp_taxonomy"] = []
                metadata_dict["rdp_bootstrap"] = []
                for i in range(0, len(tax), 2):
                    metadata_dict["rdp_taxonomy"].append(tax[i])
                    metadata_dict["rdp_bootstrap"].append(tax[i+1].replace("(","").replace(")",""))

            # format blast taxonomy to fit BIOM format (one consensus blast_taxonomy and possibly several detailed blast_affiliations)
            if "blast_taxonomy" in metadata_dict:
                metadata_dict["blast_taxonomy"] = metadata_dict["blast_taxonomy"].split(";")

                # check multihit blast : filter non consistent taxonomy hit with blast_taxonomy (if TSV modified), and compute consensus tax (if multihit line suppressed)
                if metadata_dict["blast_subject"] == "multi-subject" and multi_hit_dict is not None:
                    if cluster_name not in multi_hit_dict:
                        raise Exception("\n"+cluster_name+" has multi-subject tag but is not present in your multi-hit TSV file. Please, provide the original multi-hit TSV file.\n\n")
                    metadata_dict["blast_taxonomy"], metadata_dict["blast_affiliations"] = observation_blast_parts(metadata_dict, multi_hit_dict[cluster_name])
                    if metadata_dict["blast_affiliations"] == []:
                        raise Exception("\nyour multihit TSV file is no more consistent with your abundance TSV file for (at least) "+cluster_name+"\n\n")
                # no multi tag = blast affiliation is equal to blast_taxonomy
                else:
                    blast_dict = {key.replace("blast_",""):metadata_dict[key] for key in metadata_dict if key.startswith("blast")}
                    metadata_dict["blast_affiliations"] = [blast_dict]

                # filter blast metadata which are moved to blast_affiliations
                for metadata in metadata_dict["blast_affiliations"][0]:
                    if metadata != "taxonomy":
                        metadata_dict.pop("blast_"+metadata)

            # store count and metadata of the cluster
            clusters_count[cluster_name] = count_by_sample
            clusters_metadata[cluster_name] = metadata_dict
    finally:
        # Release the file handles even when the conversion fails.
        if Fasta_fh is not None:
            Fasta_fh.close()
        in_fh.close()

    # add samples to biom
    for sample_name in samples_names:
        biom.add_sample( sample_name )

    # add clusters to biom
    for cluster_name in clusters_count:
        biom.add_observation( cluster_name, clusters_metadata[cluster_name] )
        for sample_name in samples_names:
            # Null counts are not stored in clusters_count.
            count = clusters_count[cluster_name].get(sample_name, 0)
            if count > 0:
                biom.add_count( cluster_name, sample_name, count )

    # Write
    BiomIO.write( output_biom, biom )
Пример #8
0
        '-a',
        '--affiliation',
        required=True,
        help=
        'Path to the databank source for simulated sequence (format: fasta). The description of sequences must be the taxonomy.'
    )
    # Outputs
    group_output = parser.add_argument_group('Outputs')
    group_output.add_argument('-o',
                              '--output',
                              required=True,
                              help='The output BIOM (format: BIOM).')
    args = parser.parse_args()

    taxonomy_key = "real_taxonomy"
    biom = Biom(generated_by="grinder", matrix_type="sparse")

    # Set observations count
    for sample_name in args.samples:
        biom.add_sample(sample_name)
        fh_abund = open(args.samples[sample_name])
        for line in fh_abund:  # Content format: "# rank<TAB>seq_id<TAB>rel_abund_perc"
            if not line.startswith('#'):
                fields = line.strip().split()
                try:
                    biom.add_observation(fields[1])
                except:  # already exist
                    pass
                biom.change_count(
                    fields[1], sample_name,
                    int(float(fields[2]) * 100000000000000