def to_biom( clusters_file, count_file, output_biom, size_separator ):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file. Each line is one cluster: whitespace-separated sequence IDs, the first one being the seed.
    @param count_file : [str] path to the count file. It contains the count of sequences by sample of each preclusters. The first line is a header whose first field is ignored and whose other fields are the sample names. Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    @note : Clusters are named "Cluster_<n>" with n starting at 1 in file order. When the seed ID contains "FROGS_combined" the name gets the "_FROGS_combined" suffix and the 'comment' metadata is set to "WARNING" (otherwise "na").
    """
    biom = Biom( generated_by='swarm', matrix_type="sparse" )

    # Preclusters count by sample
    preclusters_count = dict()
    # The context manager closes the file even if a malformed line raises.
    with open( count_file ) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            precluster_id, count_str = line.strip().split(None, 1)
            # The counts stay as one raw string: for large datasets this
            # uses less RAM than a parsed per-sample structure.
            preclusters_count[precluster_id] = count_str

    # Add samples
    for sample_name in samples:
        biom.add_sample( sample_name )

    # Process count
    cluster_idx = 1
    with open( clusters_file ) as clusters_fh:
        for line in clusters_fh:
            line_fields = line.strip().split()
            seed_id = line_fields[0]
            # The test is done on the raw seed field (abundance suffix included).
            if "FROGS_combined" in seed_id:
                cluster_name = "Cluster_" + str(cluster_idx) + "_FROGS_combined"
                comment = "WARNING"
            else:
                cluster_name = "Cluster_" + str(cluster_idx)
                comment = "na"
            cluster_count = {key:0 for key in samples}
            # Retrieve count by sample
            for seq_id in line_fields:
                # Drop the trailing abundance to get the ID used in the count file.
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                sample_counts = preclusters_count[real_seq_id].split()
                for sample_idx, sample_name in enumerate(samples):
                    cluster_count[sample_name] += int(sample_counts[sample_idx])
                # The count string is no longer needed: drop the reference.
                preclusters_count[real_seq_id] = None
            # Add cluster on biom
            biom.add_observation(
                cluster_name,
                {'comment': comment, 'seed_id': seed_id.rsplit(size_separator, 1)[0]}
            )
            observation_idx = biom.find_idx("observation", cluster_name)
            for sample_idx, sample_name in enumerate(samples):
                # Only non-null counts are written in the matrix.
                if cluster_count[sample_name] > 0:
                    biom.data.change( observation_idx, sample_idx, cluster_count[sample_name] )
            # Next cluster
            cluster_idx += 1

    # Write
    BiomIO.write( output_biom, biom )
def to_biom( clusters_file, count_file, output_biom, size_separator ):
    """
    @summary : Write a biom file from swarm results.
    @param clusters_file : [str] path to the '.clstr' file. Each line is one cluster: whitespace-separated sequence IDs, the first one being the seed.
    @param count_file : [str] path to the count file. It contains the count of sequences by sample of each preclusters. The first line is a header whose first field is ignored and whose other fields are the sample names. Line format : "Precluster_id    nb_in_sampleA    nb_in_sampleB"
    @param output_biom : [str] path to the output file.
    @param size_separator : [str] the pre-cluster abundance separator.
    @note : Clusters are named "Cluster_<n>" with n starting at 1 in file order. The seed ID, without its abundance suffix, is stored in the 'seed_id' metadata.
    """
    biom = Biom( generated_by='swarm', matrix_type="sparse" )

    # Preclusters count by sample
    preclusters_count = dict()
    # The context manager closes the file even if a malformed line raises.
    with open( count_file ) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            precluster_id, count_str = line.strip().split(None, 1)
            # The counts stay as one raw string: for large datasets this
            # uses less RAM than a parsed per-sample structure.
            preclusters_count[precluster_id] = count_str

    # Add samples
    for sample_name in samples:
        biom.add_sample( sample_name )

    # Process count
    cluster_idx = 1
    with open( clusters_file ) as clusters_fh:
        for line in clusters_fh:
            cluster_name = "Cluster_" + str(cluster_idx)
            cluster_count = {key:0 for key in samples}
            line_fields = line.strip().split()
            # Retrieve count by sample
            for seq_id in line_fields:
                # Drop the trailing abundance to get the ID used in the count file.
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                sample_counts = preclusters_count[real_seq_id].split()
                for sample_idx, sample_name in enumerate(samples):
                    cluster_count[sample_name] += int(sample_counts[sample_idx])
                # The count string is no longer needed: drop the reference.
                preclusters_count[real_seq_id] = None
            # Add cluster on biom
            seed_id = line_fields[0].rsplit(size_separator, 1)[0]
            biom.add_observation( cluster_name, {'seed_id': seed_id} )
            observation_idx = biom.find_idx("observation", cluster_name)
            for sample_idx, sample_name in enumerate(samples):
                # Only non-null counts are written in the matrix.
                if cluster_count[sample_name] > 0:
                    biom.data.change( observation_idx, sample_idx, cluster_count[sample_name] )
            # Next cluster
            cluster_idx += 1

    # Write
    BiomIO.write( output_biom, biom )