def choose_cluster_subsets(otu_table_f, map_f, category, num_total_samples):
    """Choose a subset of samples spread across the values of a category.

    Samples present in both the mapping file and the OTU table are grouped
    by their value for ``category``; ``_choose_items_from_clusters`` then
    picks ``num_total_samples`` sample IDs from those groups.

    Arguments:
        otu_table_f - OTU table input handed to ``parse_biom_table``
        map_f - mapping file input handed to
            ``MetadataMap.parseMetadataMap``; it is read a second time by
            ``filter_mapping_file_from_mapping_f``
        category - mapping file category used to group the samples
        num_total_samples - total number of samples to keep

    Returns a 2-tuple: the result of ``filter_samples_from_otu_table`` and
    the result of ``filter_mapping_file_from_mapping_f``, both restricted to
    the chosen sample IDs.

    Raises ``InvalidSubsetSize`` if ``num_total_samples`` is larger than the
    number of samples in the OTU table.
    """
    otu_table = parse_biom_table(otu_table_f)
    metadata_map = MetadataMap.parseMetadataMap(map_f)

    # The mapping file is read again further down, so rewind it if it is a
    # file-like object. Inputs without a ``seek`` method are left untouched
    # (presumably a list of lines -- confirm against callers).
    try:
        map_f.seek(0)
    except AttributeError:
        pass

    num_available = len(otu_table.SampleIds)
    if num_total_samples > num_available:
        raise InvalidSubsetSize("Too many total samples (%d) were specified "
                                "as a subset size. There are only %d total "
                                "samples to choose a subset from." %
                                (num_total_samples, num_available))

    # Build the lookup once: testing membership directly against
    # otu_table.SampleIds would rescan the whole sequence for every sample
    # in the mapping file.
    otu_samp_ids = set(otu_table.SampleIds)

    category_map = defaultdict(list)
    for samp_id in metadata_map.SampleIds:
        # Mapping files can have more samples than OTU tables.
        if samp_id in otu_samp_ids:
            category_val = metadata_map.getCategoryValue(samp_id, category)
            category_map[category_val].append(samp_id)

    samp_ids_to_keep, extra_samps = _choose_items_from_clusters(
        category_map, otu_table.SampleIds, num_total_samples)
    samp_ids_to_keep.extend(extra_samps)

    # Internal sanity checks on the helper's output (not input validation).
    assert len(samp_ids_to_keep) == num_total_samples, \
        "%d != %d" % (len(samp_ids_to_keep), num_total_samples)
    assert len(samp_ids_to_keep) == len(set(samp_ids_to_keep)), \
        "Duplicate sample IDs in subset"

    # NOTE(review): 0 and inf look like min/max count bounds that disable
    # count-based filtering -- confirm against filter_samples_from_otu_table.
    return (filter_samples_from_otu_table(otu_table, samp_ids_to_keep, 0,
                                          inf),
            filter_mapping_file_from_mapping_f(map_f, samp_ids_to_keep))
def choose_gradient_subset(otu_table_f, map_f, category, num_total_samples):
    """Choose a subset of samples spread along a numeric gradient category.

    Samples present in both the mapping file and the OTU table are sorted by
    their ``category`` value (converted with ``float``), and
    ``_choose_items_from_bins`` picks ``num_total_samples`` of them.

    Arguments:
        otu_table_f - OTU table input handed to ``parse_biom_table``
        map_f - mapping file input handed to ``parse_mapping_file_to_dict``;
            it is read a second time by
            ``filter_mapping_file_from_mapping_f``
        category - mapping file category holding the gradient values; every
            retained sample's value must be convertible by ``float``
        num_total_samples - total number of samples to keep

    Returns a 2-tuple: the result of ``filter_samples_from_otu_table`` and
    the result of ``filter_mapping_file_from_mapping_f``, both restricted to
    the chosen sample IDs.

    Raises ``InvalidSubsetSize`` if ``num_total_samples`` is larger than the
    number of samples in the OTU table.
    """
    otu_table = parse_biom_table(otu_table_f)
    mdm, _ = parse_mapping_file_to_dict(map_f)

    # The mapping file is read again further down, so rewind it if it is a
    # file-like object. Inputs without a ``seek`` method are left untouched
    # (presumably a list of lines -- confirm against callers).
    try:
        map_f.seek(0)
    except AttributeError:
        pass

    num_available = len(otu_table.SampleIds)
    if num_total_samples > num_available:
        raise InvalidSubsetSize("Too many total samples (%d) were specified "
                                "as a gradient subset size. There are only %d "
                                "total samples to choose a subset from." %
                                (num_total_samples, num_available))

    # Build the lookup once: testing membership directly against
    # otu_table.SampleIds would rescan the whole sequence for every sample
    # in the mapping file.
    otu_samp_ids = set(otu_table.SampleIds)

    # Only keep the sample IDs that are in both the mapping file and OTU
    # table, paired with their numeric gradient value.
    # NOTE(review): the size check above counts OTU table samples, while the
    # candidates here are the mapping/OTU intersection -- confirm how
    # _choose_items_from_bins behaves when the intersection is smaller.
    samp_ids = [(samp_id, float(metadata[category]))
                for samp_id, metadata in mdm.items()
                if samp_id in otu_samp_ids]

    # Sort the samples according to the gradient category.
    samp_ids.sort(key=lambda samp_id: samp_id[1])

    samp_ids_to_keep = [samp_id[0] for samp_id in
                        _choose_items_from_bins(samp_ids, num_total_samples)]

    # Internal sanity checks on the helper's output (not input validation).
    assert len(samp_ids_to_keep) == num_total_samples, \
        "%d != %d" % (len(samp_ids_to_keep), num_total_samples)
    assert len(samp_ids_to_keep) == len(set(samp_ids_to_keep)), \
        "Duplicate sample IDs in subset"

    # NOTE(review): 0 and inf look like min/max count bounds that disable
    # count-based filtering -- confirm against filter_samples_from_otu_table.
    return (filter_samples_from_otu_table(otu_table, samp_ids_to_keep, 0,
                                          inf),
            filter_mapping_file_from_mapping_f(map_f, samp_ids_to_keep))