def test_filter_samples_from_otu_table(self):
    """filter_samples_from_otu_table functions as expected
    """
    # Two sample ids requested; result is compared with fixture 1c.
    samples_to_keep = ["DEF", "GHI tfasd"]
    observed = filter_samples_from_otu_table(self.input_otu_table1,
                                             samples_to_keep)
    self.assertEqual(observed, self.expected_otu_table1c)

    # order of otu table is retained regardless of samples_to_keep order
    samples_to_keep = ["XYZ"]
    observed = filter_samples_from_otu_table(self.input_otu_table1,
                                             samples_to_keep)
    self.assertEqual(observed, self.expected_otu_table1d)
def get_rare_data(otu_table, seqs_per_sample, include_small_samples=False,
                  subsample_f=subsample):
    """Subsample every sample of an OTU table down to seqs_per_sample.

    - include_small_samples=False => samples with fewer than
      seqs_per_sample total sequences are filtered out first
    - include_small_samples=True => such samples are returned unchanged
    - otu_table (input and output) is otus (rows) by samples (cols)
    - no otus are removed, even if they are absent in the rarefied table
    - subsample_f is called as subsample_f(sample_vector, seqs_per_sample)
    """
    if not include_small_samples:
        # Keep all sample ids; only the count range [seqs_per_sample, inf]
        # does any filtering here.
        all_sample_ids = otu_table.SampleIds
        otu_table = filter_samples_from_otu_table(otu_table,
                                                  all_sample_ids,
                                                  seqs_per_sample,
                                                  inf)

    def rarefy_sample(x, s_id, s_md):
        # Samples already below the requested depth pass through untouched;
        # everything else is handed to subsample_f.
        return (x if x.sum() < seqs_per_sample
                else subsample_f(x, seqs_per_sample))

    return otu_table.transformSamples(rarefy_sample)
def choose_cluster_subsets(otu_table_f, map_f, category, num_total_samples):
    """Choose num_total_samples samples spread over the values of category.

    otu_table_f: input handed to parse_biom_table.
    map_f: mapping file input; parsed here and then passed again to
     filter_mapping_file_from_mapping_f, so it is rewound when possible.
    category: mapping file category used to group the samples.
    num_total_samples: size of the subset to choose.

    Returns a tuple (filtered OTU table, filtered mapping file) as produced
    by filter_samples_from_otu_table and filter_mapping_file_from_mapping_f.

    Raises InvalidSubsetSize if num_total_samples exceeds the number of
    samples in the OTU table.
    """
    otu_table = parse_biom_table(otu_table_f)
    metadata_map = MetadataMap.parseMetadataMap(map_f)

    # The mapping file is read a second time below. Rewind it if it is a
    # file-like object; inputs without seek() (e.g. a list of lines) are
    # deliberately left alone.
    try:
        map_f.seek(0)
    except AttributeError:
        pass

    num_available = len(otu_table.SampleIds)
    if num_total_samples > num_available:
        raise InvalidSubsetSize("Too many total samples (%d) were specified "
                                "as a subset size. There are only %d total "
                                "samples to choose a subset from." %
                                (num_total_samples, num_available))

    # Build the lookup once: testing membership against the SampleIds
    # sequence itself would be a linear scan per mapping-file sample.
    otu_sample_ids = set(otu_table.SampleIds)

    category_map = defaultdict(list)
    for samp_id in metadata_map.SampleIds:
        # Mapping files can have more samples than OTU tables.
        if samp_id in otu_sample_ids:
            category_val = metadata_map.getCategoryValue(samp_id, category)
            category_map[category_val].append(samp_id)

    samp_ids_to_keep, extra_samps = _choose_items_from_clusters(
        category_map, otu_table.SampleIds, num_total_samples)
    samp_ids_to_keep.extend(extra_samps)

    # Internal sanity checks on the helper's output (not input validation).
    assert len(samp_ids_to_keep) == num_total_samples, \
        "%d != %d" % (len(samp_ids_to_keep), num_total_samples)
    assert len(samp_ids_to_keep) == len(set(samp_ids_to_keep)), \
        "Duplicate sample IDs in subset"

    return (filter_samples_from_otu_table(otu_table, samp_ids_to_keep, 0,
                                          inf),
            filter_mapping_file_from_mapping_f(map_f, samp_ids_to_keep))
def choose_gradient_subset(otu_table_f, map_f, category, num_total_samples):
    """Choose num_total_samples samples spread along a numeric gradient.

    otu_table_f: input handed to parse_biom_table.
    map_f: mapping file input; parsed here and then passed again to
     filter_mapping_file_from_mapping_f, so it is rewound when possible.
    category: mapping file category whose values are converted with float()
     and used to sort the samples.
    num_total_samples: size of the subset to choose.

    Returns a tuple (filtered OTU table, filtered mapping file) as produced
    by filter_samples_from_otu_table and filter_mapping_file_from_mapping_f.

    Raises InvalidSubsetSize if num_total_samples exceeds the number of
    samples in the OTU table.
    """
    otu_table = parse_biom_table(otu_table_f)
    mdm, _ = parse_mapping_file_to_dict(map_f)

    # The mapping file is read a second time below. Rewind it if it is a
    # file-like object; inputs without seek() are deliberately left alone.
    try:
        map_f.seek(0)
    except AttributeError:
        pass

    num_available = len(otu_table.SampleIds)
    if num_total_samples > num_available:
        raise InvalidSubsetSize("Too many total samples (%d) were specified "
                                "as a gradient subset size. There are only %d "
                                "total samples to choose a subset from." %
                                (num_total_samples, num_available))

    # Build the lookup once: testing membership against the SampleIds
    # sequence itself would be a linear scan per mapping-file sample.
    otu_sample_ids = set(otu_table.SampleIds)

    # Only keep the sample IDs that are in both the mapping file and OTU table.
    # Sort the samples according to the gradient category.
    samp_ids = [(samp_id, float(metadata[category]))
                for samp_id, metadata in mdm.items()
                if samp_id in otu_sample_ids]
    samp_ids.sort(key=lambda samp_id: samp_id[1])

    samp_ids_to_keep = [samp_id[0] for samp_id in
                        _choose_items_from_bins(samp_ids, num_total_samples)]

    # Internal sanity checks on the helper's output (not input validation).
    assert len(samp_ids_to_keep) == num_total_samples, \
        "%d != %d" % (len(samp_ids_to_keep), num_total_samples)
    assert len(samp_ids_to_keep) == len(set(samp_ids_to_keep)), \
        "Duplicate sample IDs in subset"

    return (filter_samples_from_otu_table(otu_table, samp_ids_to_keep, 0,
                                          inf),
            filter_mapping_file_from_mapping_f(map_f, samp_ids_to_keep))
def reconcile_hosts_symbionts(otu_file, host_dist):
    """Trim a cOTU table and a host distance matrix to matching samples.

    otu_file: cOTU table input passed to filter_samples_from_otu_table.
    host_dist: host distance matrix; its first element is used as the list
     of sample ids (presumably a (ids, matrix) tuple; verify against caller).

    Returns (StringIO of the re-formatted filtered cOTU table,
             filtered host distance matrix).
    """
    # filter cOTU table by samples present in host_tree/dm
    host_sample_ids = host_dist[0]
    host_only_table = filter_samples_from_otu_table(otu_file,
                                                    host_sample_ids,
                                                    negate=True)

    # Now the cOTU table only has the samples present in the host dm;
    # parse it into its component parts.
    sample_names, taxon_names, data, lineages = \
        parse_otu_table(host_only_table)

    # filter cOTU table again because skip_empty doesn't seem to be
    # working in format_otu_table called from
    # filter_samples_from_otu_table
    sample_names, taxon_names, data, lineages = \
        filter_otu_table_by_min(sample_names, taxon_names, data, lineages,
                                min=1)

    # Filter the host_dists to match the newly trimmed subtree
    # Note: this is requiring the modified filter_dist method which
    # returns a native dm tuple rather than a string.
    host_dist_filtered = filter_samples_from_distance_matrix(host_dist,
                                                             sample_names,
                                                             negate=True)

    table_text = format_otu_table(sample_names, taxon_names, data, lineages)
    return StringIO(table_text), host_dist_filtered
def get_rare_data(otu_table, seqs_per_sample, include_small_samples=False,
                  subsample_f=subsample):
    """Subsample every sample of an OTU table down to seqs_per_sample.

    - include_small_samples=False => samples with fewer than
      seqs_per_sample total sequences are filtered out first
    - include_small_samples=True => such samples are returned unchanged
    - otu_table (input and output) is otus (rows) by samples (cols)
    - no otus are removed, even if they are absent in the rarefied table
    - subsample_f is called as subsample_f(int_sample_vector,
      seqs_per_sample)
    """
    def rarefy_sample(x, s_id, s_md):
        # Samples already below the requested depth pass through untouched;
        # everything else is cast to int and handed to subsample_f.
        return (x if x.sum() < seqs_per_sample
                else subsample_f(x.astype(int), seqs_per_sample))

    # NOTE(review): errstate(empty='raise') presumably makes an empty table
    # raise instead of being silently returned -- confirm against biom.
    with errstate(empty='raise'):
        if not include_small_samples:
            # Keep all sample ids; only the count range does any filtering.
            otu_table = filter_samples_from_otu_table(otu_table,
                                                      otu_table.ids(),
                                                      seqs_per_sample,
                                                      inf)

        # subsample samples that have too many sequences
        return otu_table.transform(rarefy_sample, axis='sample')
def main():
    """Filter samples from a BIOM table and optionally its mapping file.

    Samples are selected by mapping-file metadata (--mapping_fp with
    --valid_states), by a sample id file (--sample_id_fp), and/or by a
    min/max count range. The filtered table is written to output_fp; if
    output_mapping_fp is given, the mapping file is filtered to the samples
    that remain in the table.

    Invalid option combinations and an empty result are reported through
    option_parser.error.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if mapping_fp is None and valid_states is not None:
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination "
            "of those)."
        )
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        # The handle is closed as soon as the ids are extracted; this assumes
        # sample_ids_from_metadata_description consumes the file before
        # returning -- TODO confirm.
        with open(mapping_fp, "U") as mapping_f:
            sample_ids_to_keep = sample_ids_from_metadata_description(
                mapping_f, valid_states)
        # Negation only applies to ids coming from --sample_id_fp.
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            # First whitespace-delimited token of each line is the sample
            # id; comment lines are skipped, and so are blank lines, which
            # would otherwise raise IndexError on the [0] lookup.
            with open(sample_id_fp, "U") as sample_id_f:
                sample_id_f_ids = set(
                    line.strip().split()[0] for line in sample_id_f
                    if line.strip() and not line.startswith("#"))
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table, sample_ids_to_keep, min_count, max_count,
        negate_ids_to_keep=negate_sample_id_fp
    )

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering."
        )

    # filter mapping file if requested
    if output_mapping_fp:
        with open(mapping_fp, "U") as mapping_f:
            mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        # Context manager guarantees the output is flushed and closed.
        with open(output_mapping_fp, "w") as output_mapping_f:
            output_mapping_f.write(
                format_mapping_file(mapping_headers, mapping_data))
def main():
    """Filter samples from a BIOM table and optionally its mapping file.

    Samples are selected by mapping-file metadata (mapping_fp with
    valid_states), by a sample id file (sample_id_fp), and/or by a min/max
    count range. The filtered table is written to output_fp; if
    output_mapping_fp is given, the mapping file is filtered to the samples
    that remain in the table.

    Invalid option combinations are reported through option_parser.error.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(input_fp)

    if mapping_fp and valid_states:
        # The handle is closed as soon as the ids are extracted; this assumes
        # sample_ids_from_metadata_description consumes the file before
        # returning -- TODO confirm.
        with open(mapping_fp, 'U') as mapping_f:
            sample_ids_to_keep = sample_ids_from_metadata_description(
                mapping_f, valid_states)
    else:
        sample_ids_to_keep = otu_table.ids()

    # NOTE(review): applied after either branch above, so ids from the
    # mapping file and the id file are intersected -- confirm this nesting.
    if sample_id_fp is not None:
        # First whitespace-delimited token of each line is the sample id;
        # comment lines are skipped, and so are blank lines, which would
        # otherwise raise IndexError on the [0] lookup.
        with open(sample_id_fp, 'U') as sample_id_f:
            sample_id_f_ids = set(
                line.strip().split()[0] for line in sample_id_f
                if line.strip() and not line.startswith('#'))
        sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(otu_table,
                                                       sample_ids_to_keep,
                                                       min_count,
                                                       max_count)
    write_biom_table(filtered_otu_table, output_fp)

    # filter mapping file if requested
    if output_mapping_fp:
        with open(mapping_fp, 'U') as mapping_f:
            mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        # Context manager guarantees the output is flushed and closed.
        with open(output_mapping_fp, 'w') as output_mapping_f:
            output_mapping_f.write(
                format_mapping_file(mapping_headers, mapping_data))
def main():
    """Filter samples from a BIOM table and optionally its mapping file.

    Samples are selected by mapping-file metadata (mapping_fp with
    valid_states), by a sample id file (sample_id_fp), and/or by a min/max
    count range. The filtered table is written to output_fp; if
    output_mapping_fp is given, the mapping file is filtered to the samples
    that remain in the table.

    Invalid option combinations are reported through option_parser.error.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error(
            "No filtering requested. Must provide either "
            "mapping_fp and valid states, min counts, "
            "max counts, or sample_id_fp (or some combination of those).")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    with open(input_fp, 'U') as input_f:
        otu_table = parse_biom_table(input_f)

    # The output file is still opened before any filtering work, as in the
    # original, but is now closed even if filtering raises.
    with open(output_fp, 'w') as output_f:
        if (mapping_fp and valid_states):
            # Assumes sample_ids_from_metadata_description consumes the file
            # before returning -- TODO confirm.
            with open(mapping_fp, 'U') as mapping_f:
                sample_ids_to_keep = sample_ids_from_metadata_description(
                    mapping_f, valid_states)
        else:
            sample_ids_to_keep = otu_table.SampleIds

        # NOTE(review): applied after either branch above, so ids from the
        # mapping file and the id file are intersected -- confirm nesting.
        if (sample_id_fp is not None):
            # First whitespace-delimited token of each line is the sample
            # id; comment lines are skipped, and so are blank lines, which
            # would otherwise raise IndexError on the [0] lookup.
            with open(sample_id_fp, 'U') as sample_id_f:
                sample_id_f_ids = set(
                    line.strip().split()[0] for line in sample_id_f
                    if line.strip() and not line.startswith('#'))
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

        filtered_otu_table = filter_samples_from_otu_table(
            otu_table, sample_ids_to_keep, min_count, max_count)
        output_f.write(format_biom_table(filtered_otu_table))

    # filter mapping file if requested
    if output_mapping_fp:
        with open(mapping_fp, 'U') as mapping_f:
            mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.SampleIds)
        # Context manager guarantees the output is flushed and closed.
        with open(output_mapping_fp, 'w') as output_mapping_f:
            output_mapping_f.write(
                format_mapping_file(mapping_headers, mapping_data))
def get_order_from_categories(otu_table, category_labels):
    """Groups samples by category values; clusters within each group

    category_labels holds one label per sample, aligned with
    otu_table.ids(). Returns an array of sample positions: the groups
    appear in the order given by unique(), and positions inside a group
    follow the ordering returned by get_clusters.
    """
    labels = array(category_labels)
    ordered_positions = []
    for value in unique(labels):
        # Boolean mask marking the samples that carry this label.
        in_group = labels == value
        group_ids = [sample_id
                     for keep, sample_id in zip(in_group, otu_table.ids())
                     if keep]
        group_table = filter_samples_from_otu_table(otu_table, group_ids,
                                                    -inf, inf)
        # One row per observation, so the samples are the columns.
        group_data = asarray(
            list(group_table.iter_data(axis="observation")))
        within_group_order = array(get_clusters(group_data, axis="column"))
        # Translate within-group indices back to positions in the full
        # table.
        group_positions = nonzero(in_group)[0]
        ordered_positions.extend(group_positions[within_group_order])
    return array(ordered_positions)
def get_order_from_categories(otu_table, category_labels):
    """Groups samples by category values; clusters within each group

    category_labels holds one label per sample, aligned with
    otu_table.ids(). Returns an array of sample positions: the groups
    appear in the order given by np.unique, and positions inside a group
    follow the ordering returned by get_clusters.
    """
    labels = np.array(category_labels)
    ordered_positions = []
    for value in np.unique(labels):
        # Boolean mask marking the samples that carry this label.
        in_group = labels == value
        group_ids = [sample_id
                     for keep, sample_id in zip(in_group, otu_table.ids())
                     if keep]
        group_table = filter_samples_from_otu_table(otu_table, group_ids,
                                                    -np.inf, np.inf)
        # One row per observation, so the samples are the columns.
        group_rows = [row for row in
                      group_table.iter_data(axis='observation')]
        group_data = np.asarray(group_rows)
        within_group_order = np.array(
            get_clusters(group_data, axis='column'))
        # Translate within-group indices back to positions in the full
        # table.
        group_positions = np.nonzero(in_group)[0]
        ordered_positions.extend(group_positions[within_group_order])
    return np.array(ordered_positions)
def get_order_from_categories(otu_table, category_labels):
    """Groups samples by category values; clusters within each group

    category_labels holds one label per sample, aligned with
    otu_table.SampleIds. Returns an array of sample positions: the groups
    appear in the order given by unique(), and positions inside a group
    follow the ordering returned by get_clusters.
    """
    labels = array(category_labels)
    ordered_positions = []
    for value in unique(labels):
        # Boolean mask marking the samples that carry this label.
        in_group = labels == value
        group_ids = [sample_id
                     for keep, sample_id in zip(in_group,
                                                otu_table.SampleIds)
                     if keep]
        group_table = filter_samples_from_otu_table(otu_table, group_ids,
                                                    0, inf)
        # One row per observation, so the samples are the columns.
        group_data = asarray(list(group_table.iterObservationData()))
        within_group_order = array(get_clusters(group_data, axis='column'))
        # Translate within-group indices back to positions in the full
        # table.
        group_positions = nonzero(in_group)[0]
        ordered_positions.extend(group_positions[within_group_order])
    return array(ordered_positions)
def get_overlapping_samples(map_rows, otu_table):
    """Extracts only samples contained in otu table and mapping file.

    map_rows: mapping file rows; the first element of each row is taken
     as its sample id.
    otu_table: table whose SampleIds are intersected with the map ids.

    Returns: new_map_rows, new_otu_table
     new_map_rows follows the order of map_rows; new_otu_table is the
     result of filter_samples_from_otu_table for the shared ids.
    """
    # Read the id column row by row instead of zip(*map_rows)[0]: that
    # form needs a subscriptable zip() result (Python 2 only) and raises
    # IndexError when map_rows is empty.
    map_sample_ids = [row[0] for row in map_rows]
    shared_ids = set(map_sample_ids) & set(otu_table.SampleIds)

    otu_table = filter_samples_from_otu_table(otu_table, shared_ids, 0, inf)

    # One pass to index rows by id replaces a list.index() scan per row.
    # setdefault keeps the FIRST row seen for an id, which is exactly what
    # list.index() selected before.
    first_row_by_id = {}
    for row in map_rows:
        first_row_by_id.setdefault(row[0], row)

    new_map = [first_row_by_id[sam_id]
               for sam_id in map_sample_ids
               if sam_id in shared_ids]

    return new_map, otu_table
def get_overlapping_samples(map_rows, otu_table):
    """Extracts only samples contained in otu table and mapping file.

    map_rows: mapping file rows; the first element of each row is taken
     as its sample id.
    otu_table: table whose sample_ids are intersected with the map ids.

    Returns: new_map_rows, new_otu_table
     new_map_rows follows the order of map_rows; new_otu_table is the
     result of filter_samples_from_otu_table for the shared ids.
    """
    # Read the id column row by row instead of zip(*map_rows)[0]: that
    # form needs a subscriptable zip() result (Python 2 only) and raises
    # IndexError when map_rows is empty.
    map_sample_ids = [row[0] for row in map_rows]
    shared_ids = set(map_sample_ids) & set(otu_table.sample_ids)

    otu_table = filter_samples_from_otu_table(otu_table, shared_ids, -inf,
                                              inf)

    # One pass to index rows by id replaces a list.index() scan per row.
    # setdefault keeps the FIRST row seen for an id, which is exactly what
    # list.index() selected before.
    first_row_by_id = {}
    for row in map_rows:
        first_row_by_id.setdefault(row[0], row)

    new_map = [first_row_by_id[sam_id]
               for sam_id in map_sample_ids
               if sam_id in shared_ids]

    return new_map, otu_table
def test_filter_samples_from_otu_table_negate(self):
    """filter_samples_from_otu_table functions w negate
    """
    # With negate=True the listed ids are passed as the ones to negate;
    # the result is compared with fixture 1c.
    negated_ids = ["ABC blah", "XYZ"]
    observed = filter_samples_from_otu_table(self.input_otu_table1,
                                             negated_ids,
                                             negate=True)
    self.assertEqual(observed, self.expected_otu_table1c)
def main():
    """Prepare the output directory and run the diversity workflows.

    Steps, in order:
     1. create output_dir (an existing directory is only accepted with
        opts.force)
     2. build the selectors file, a mapping file restricted to the samples
        in the biom table, and the study preferences file
     3. write a copy of the biom table filtered to samples with at least
        seqs_per_sample counts (used for the alpha rarefaction step)
     4. load parameters from opts.parameter_fp, or build defaults
     5. copy the raw table and run run_beta_diversity_through_plots and
        run_alpha_rarefaction

    Problems with the options are reported through option_parser.error.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    otu_table_fp = opts.otu_table_fp
    output_dir = opts.output_dir
    mapping_fp = opts.mapping_fp
    tree_fp = opts.tree_fp
    verbose = opts.verbose
    print_only = opts.print_only
    seqs_per_sample = int(opts.seqs_per_sample)
    parallel = opts.parallel
    min_seqs_sample = opts.min_seqs_sample
    subject_category = opts.subject_name

    try:
        makedirs(output_dir)
    except OSError:
        # NOTE(review): any OSError from makedirs is reported as "already
        # exists"; other causes are not distinguished.
        if not opts.force:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            option_parser.error("Output directory already exists. Please choose"
                " a different directory, or force overwrite with -f.")

    ## ******************** make_evident_selectors ********************
    ## The code for make_evident_selectors.py is here and has to go before the params
    ## validation as we need to know the main cats before creating the params file
    # Input handles are closed right after parsing; this assumes both
    # parsers consume the file before returning -- TODO confirm.
    with open(mapping_fp, 'U') as mapping_f:
        map_data, headers, _ = parse_mapping_file(mapping_f)
    with open(otu_table_fp, 'U') as otu_table_f:
        biom_table = parse_biom_table(otu_table_f)

    # getting valid samples from biom file
    real_map_headers, real_map_data = filter_mapping_file(
        map_data, headers, biom_table.SampleIds, include_repeat_cols=False)

    if subject_category not in real_map_headers:
        option_parser.error('This column: %s is not in the mapping file, try %s'%\
            (subject_category, real_map_headers))

    sorted_counts_per_sample = get_sorted_counts_per_sample(biom_table)

    mapping_file_tuple = (real_map_data, real_map_headers)

    # calculate the available subjects at each rarefaction level
    results, main_map_cat = make_selectors(
        sorted_counts_per_sample, min_seqs_sample,
        mapping_file_tuple, subject_category, verbose=verbose)

    with open(join(output_dir,'selectors.txt'),'w') as fout:
        fout.write('#Sequences\tSubjects\tSamples\tMetadata\n')
        fout.write('\n'.join(results))

    with open(join(output_dir,'mapping_file.txt'),'w') as fout:
        fout.write(format_mapping_file(real_map_headers, real_map_data))
    ## ******************** make_evident_selectors ********************

    with open(join(output_dir,'study_preferences.txt'),'w') as fout:
        fout.write('%d\n' % seqs_per_sample)
        fout.write('%s\n' % subject_category)

    ## ******************** filter_samples_from_otu_table ********************
    ## Filtering original biom file to only have samples above the max length to avoid
    ## ugly plots
    alpha_biom_file = join(output_dir,'filtered_otu_table_for_alpha.biom')
    with open(alpha_biom_file,'w') as fout:
        # All sample ids are kept; only the count range filters samples.
        sample_ids_to_keep = biom_table.SampleIds
        filtered_otu_table = filter_samples_from_otu_table(
            biom_table, sample_ids_to_keep,
            min_count=seqs_per_sample, max_count=inf)
        fout.write(format_biom_table(filtered_otu_table))
    ## ******************** filter_samples_from_otu_table ********************

    if opts.parameter_fp:
        try:
            parameter_f = open(opts.parameter_fp, 'U')
        except IOError:
            option_parser.error("Can't open parameters file (%s). Does it exist? " \
             "Do you have read access?" % opts.parameter_fp)
        # Closed even if parsing the parameters raises.
        with parameter_f:
            params = parse_qiime_parameters(parameter_f)
    else:
        params = parse_qiime_parameters(
            ['beta_diversity:metrics unweighted_unifrac',\
             'make_rarefaction_plots:prefs_path %s' % join(output_dir,'prefs.txt'),
             'make_rarefaction_plots:colorby %s' % ','.join(main_map_cat),
             'make_rarefaction_plots:output_type memory',
             'multiple_rarefactions:min %d' % int(seqs_per_sample/4),
             'multiple_rarefactions:max %d' % (seqs_per_sample+1),
             'multiple_rarefactions:step %d' % int(seqs_per_sample/4),
             'multiple_rarefactions:num-reps 4',
            ])
        # empty list returns empty defaultdict for now

    jobs_to_start = opts.jobs_to_start
    default_jobs_to_start = qiime_config['jobs_to_start']
    validate_and_set_jobs_to_start(params,
                                   jobs_to_start,
                                   default_jobs_to_start,
                                   parallel,
                                   option_parser)

    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    copyfile(otu_table_fp, join(output_dir,'raw.biom'))

    # Beta diversity runs on the original table at the requested depth.
    run_beta_diversity_through_plots(otu_table_fp=otu_table_fp,
     mapping_fp=mapping_fp,
     output_dir=output_dir,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     color_by_interesting_fields_only=False,
     sampling_depth=seqs_per_sample,
     histogram_categories=None,
     tree_fp=tree_fp,
     parallel=parallel,
     suppress_3d_plots=True,
     suppress_2d_plots=True,
     status_update_callback=status_update_callback)

    # Alpha rarefaction runs on the count-filtered table written above and
    # puts its results in an 'alpha' subdirectory.
    output_dir = join(output_dir,'alpha')
    run_alpha_rarefaction(otu_table_fp=alpha_biom_file,\
     mapping_fp=mapping_fp,\
     output_dir=output_dir,\
     command_handler=command_handler,\
     params=params,
     qiime_config=qiime_config,\
     tree_fp=tree_fp,\
     num_steps=4,\
     parallel=parallel,\
     min_rare_depth=10,
     max_rare_depth=20,
     status_update_callback=status_update_callback,
     plot_stderr_and_stddev=True)
def main():
    """Filter samples from a BIOM table and optionally its mapping file.

    Samples are selected by mapping-file metadata (--mapping_fp with
    --valid_states), by a sample id file (--sample_id_fp), and/or by a
    min/max count range. The filtered table is written to output_fp; if
    output_mapping_fp is given, the mapping file is filtered to the samples
    that remain in the table.

    Invalid option combinations and an empty result are reported through
    option_parser.error.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    mapping_fp = opts.mapping_fp
    output_mapping_fp = opts.output_mapping_fp
    valid_states = opts.valid_states
    min_count = opts.min_count
    max_count = opts.max_count
    sample_id_fp = opts.sample_id_fp

    if (mapping_fp is None and valid_states is not None):
        option_parser.error("--mapping_fp must be provided if --valid_states "
                            "is passed.")

    if not ((mapping_fp and valid_states) or
            min_count != 0 or
            not isinf(max_count) or
            sample_id_fp is not None):
        option_parser.error("No filtering requested. Must provide either "
                            "mapping_fp and valid states, min counts, "
                            "max counts, or sample_id_fp (or some combination "
                            "of those).")
    if (mapping_fp and valid_states) and sample_id_fp:
        option_parser.error("Providing both --sample_id_fp and "
                            "--mapping_fp/--valid_states is not supported.")
    if output_mapping_fp and not mapping_fp:
        option_parser.error("Must provide input mapping file to generate"
                            " output mapping file.")

    otu_table = load_table(input_fp)

    negate_sample_id_fp = opts.negate_sample_id_fp
    if mapping_fp and valid_states:
        # The handle is closed as soon as the ids are extracted; this assumes
        # sample_ids_from_metadata_description consumes the file before
        # returning -- TODO confirm.
        with open(mapping_fp, 'U') as mapping_f:
            sample_ids_to_keep = sample_ids_from_metadata_description(
                mapping_f, valid_states)
        # Negation only applies to ids coming from --sample_id_fp.
        negate_sample_id_fp = False
    else:
        sample_ids_to_keep = otu_table.ids()

        if sample_id_fp is not None:
            # First whitespace-delimited token of each line is the sample
            # id; comment lines are skipped, and so are blank lines, which
            # would otherwise raise IndexError on the [0] lookup.
            with open(sample_id_fp, 'U') as sample_id_f:
                sample_id_f_ids = set(
                    line.strip().split()[0] for line in sample_id_f
                    if line.strip() and not line.startswith('#'))
            sample_ids_to_keep = set(sample_ids_to_keep) & sample_id_f_ids

    filtered_otu_table = filter_samples_from_otu_table(
        otu_table, sample_ids_to_keep, min_count, max_count,
        negate_ids_to_keep=negate_sample_id_fp)

    try:
        write_biom_table(filtered_otu_table, output_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no samples remained after filtering.")

    # filter mapping file if requested
    if output_mapping_fp:
        with open(mapping_fp, 'U') as mapping_f:
            mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data, mapping_headers, filtered_otu_table.ids())
        # Context manager guarantees the output is flushed and closed.
        with open(output_mapping_fp, 'w') as output_mapping_f:
            output_mapping_f.write(
                format_mapping_file(mapping_headers, mapping_data))