def test_parse_metadata_state_descriptions(self):
    """Check parsing of an empty and of a two-category state description."""
    # An empty description is expected to produce an empty mapping.
    parsed_empty = parse_metadata_state_descriptions('')
    self.assertEqual(parsed_empty, {})

    # Two categories, each followed by its comma-separated states; the
    # expected result maps every category name to the set of its states.
    description = 'Study:Twin,Hand,Dog;BodySite:Palm,Stool'
    parsed = parse_metadata_state_descriptions(description)
    expected = {
        'Study': set(['Twin', 'Hand', 'Dog']),
        'BodySite': set(['Palm', 'Stool']),
    }
    self.assertEqual(parsed, expected)
def get_sam_ids(map_data, map_header, colorby, cat, primary_state,
                secondary_state):
    """Return the sample ids matching the primary and secondary states.

    map_data, map_header: mapping-file rows and header, passed through to
        get_sample_ids; the first element of each row is used as the
        sample id when colorby is None.
    colorby: category name, e.g. 'Country'. Pass None to consider every
        sample instead of only those with colorby == cat.
    cat: value of the colorby category, e.g. 'USA'.
    primary_state: state description string, e.g. 'AgeCategory:Child'.
    secondary_state: a state description like primary_state, or None. When
        None, the secondary group is every candidate sample that is not in
        the primary group.

    Returns a (primary_ids, secondary_ids) tuple of lists. Each list is
    de-duplicated through a set, so the order of its items is arbitrary.
    """
    # Candidate pool: all samples, or only those in the colorby:cat group.
    if colorby is None:
        sample_ids = [sam[0] for sam in map_data]
    else:
        sample_ids = get_sample_ids(map_data, map_header, {colorby: [cat]})

    # Keys of the parsed states are category labels (e.g. AgeCategory) and
    # values are the accepted values for that category (e.g. Adult).
    primary_states = parse_metadata_state_descriptions(primary_state)
    if colorby is not None:
        # Restrict the primary group to the colorby:cat samples as well.
        primary_states[colorby] = [cat]
    state1_samids = get_sample_ids(map_data, map_header, primary_states)

    if secondary_state is None:
        # No explicit secondary state: use the complement of the primary
        # group within the candidate pool.
        state2_samids = set(sample_ids).difference(set(state1_samids))
    else:
        secondary_states = parse_metadata_state_descriptions(secondary_state)
        if colorby is not None:
            secondary_states[colorby] = [cat]
        state2_samids = get_sample_ids(map_data, map_header,
                                       secondary_states)

    return list(set(state1_samids)), list(set(state2_samids))
def get_sam_ids(map_data, map_header, colorby, cat, primary_state,
                secondary_state):
    """Split samples into a primary-state group and a secondary-state group.

    colorby: category name such as 'Country', or None to work with all
        samples rather than only the colorby:cat ones.
    cat: value of the colorby category, such as 'USA'.
    primary_state: state description such as 'AgeCategory:Child'.
    secondary_state: None, or a description in the same form as
        primary_state; None selects the candidates outside the primary group.

    Returns two lists of unique sample ids; their ordering is arbitrary
    because both pass through a set.
    """
    restrict_to_cat = colorby is not None

    def _ids_for(description):
        # Parse a state description, pin it to colorby:cat when requested,
        # and look up the matching sample ids.
        states = parse_metadata_state_descriptions(description)
        if restrict_to_cat:
            states[colorby] = [cat]
        return get_sample_ids(map_data, map_header, states)

    # Pool of samples the secondary group is drawn from by default.
    if restrict_to_cat:
        candidates = get_sample_ids(map_data, map_header, {colorby: [cat]})
    else:
        candidates = [row[0] for row in map_data]

    first_group = _ids_for(primary_state)

    if secondary_state is not None:
        second_group = _ids_for(secondary_state)
    else:
        second_group = set(candidates).difference(set(first_group))

    return list(set(first_group)), list(set(second_group))
def test_parse_metadata_state_descriptions(self):
    """parse_metadata_state_descriptions should map categories to states."""
    # Each case pairs a description string with the mapping it should
    # parse to; cases are checked in order.
    cases = [
        ('', {}),
        ('Study:Twin,Hand,Dog;BodySite:Palm,Stool',
         {'Study': set(['Twin', 'Hand', 'Dog']),
          'BodySite': set(['Palm', 'Stool'])}),
    ]
    for description, expected in cases:
        observed = parse_metadata_state_descriptions(description)
        self.assertEqual(observed, expected)
def test_get_sample_ids(self):
    """get_sample_ids should return the ids selected by each description."""
    # (state description, sample ids expected from the fixture mapping data)
    expectations = (
        ('Study:Twin', []),
        ('Study:Dog', ['a', 'b']),
        ('Study:*,!Dog', ['c', 'd', 'e']),
        ('Study:*,!Dog;BodySite:Stool', ['e']),
        ('BodySite:Stool', ['a', 'b', 'e']),
    )
    for description, expected_ids in expectations:
        states = parse_metadata_state_descriptions(description)
        observed_ids = get_sample_ids(self.map_data, self.map_headers,
                                      states)
        self.assertEqual(observed_ids, expected_ids)
def sample_ids_from_metadata_description(mapping_f, valid_states_str):
    """Return the sample ids in a mapping file that match a description.

    mapping_f is handed to parse_mapping_file and valid_states_str to
    parse_metadata_state_descriptions; the result is whatever
    get_sample_ids returns for the parsed mapping data and states.
    """
    # The third item returned by the parser (comments) is not needed here.
    rows, header, _comments = parse_mapping_file(mapping_f)
    wanted_states = parse_metadata_state_descriptions(valid_states_str)
    return get_sample_ids(rows, header, wanted_states)
def filter_otus_and_map(map_infile, otu_infile, map_outfile, otu_outfile,
                        valid_states_str, num_seqs_per_otu):
    """Filter an OTU file and its mapping file down to the matching samples.

    map_infile: open mapping file; it is parsed and then closed.
    otu_infile: iterable of OTU-file lines.
    map_outfile, otu_outfile: writable file-like objects; each is closed
        after writing unless it is a StringIO instance.
    valid_states_str: state description selecting the samples to keep.
    num_seqs_per_otu: passed to filter_line as min_count for data lines.

    Raises ValueError if a data line appears before the '#OTU ID' header
    line, because the columns to keep are not known at that point.
    """
    map_data, map_header, _comments = parse_mapping_file(map_infile)
    map_infile.close()
    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids = get_sample_ids(map_data, map_header, valid_states)

    # Write out the filtered mapping file: a '#'-prefixed header line
    # followed by one tab-joined line per kept row.
    out_headers, out_data = filter_map(map_data, map_header, sample_ids)
    header_line = '#' + '\t'.join(out_headers)
    # Build a real list so the concatenation below works whether or not
    # map() returns a list.
    data_lines = ['\t'.join(row) for row in out_data]
    map_outfile.write('\n'.join([header_line] + data_lines))
    if not isinstance(map_outfile, StringIO):
        map_outfile.close()

    # Write out the filtered OTU file.
    cols = None
    header_seen = False
    for line in otu_infile:
        if line.startswith('#OTU ID'):
            # The header line determines which columns are kept.
            cols = find_good_cols(line, sample_ids)
            header_seen = True
            filter_line(line, cols, min_count=None, outfile=otu_outfile)
        elif line.startswith('#'):
            # Other comment lines are copied through unchanged.
            otu_outfile.write(line)
        else:
            if not header_seen:
                raise ValueError("OTU data line found before the '#OTU ID' "
                                 "header line")
            filter_line(line, cols, min_count=num_seqs_per_otu,
                        outfile=otu_outfile)
    if not isinstance(otu_outfile, StringIO):
        otu_outfile.close()
def sample_ids_from_metadata_description(mapping_f, valid_states_str):
    """Look up the sample ids that satisfy a metadata state description.

    The mapping file is parsed with parse_mapping_file, the description
    with parse_metadata_state_descriptions, and both are passed to
    get_sample_ids, whose result is returned unchanged.
    """
    parsed_rows, parsed_header, _ = parse_mapping_file(mapping_f)
    return get_sample_ids(
        parsed_rows,
        parsed_header,
        parse_metadata_state_descriptions(valid_states_str))
def sample_ids_from_metadata_description(mapping_f, valid_states_str):
    """Return the sample ids matching a metadata state description.

    Raises ValueError when no sample id matches the description.
    """
    data_rows, header, _comments = parse_mapping_file(mapping_f)
    states = parse_metadata_state_descriptions(valid_states_str)
    matching_ids = get_sample_ids(data_rows, header, states)

    # Happy path first: at least one sample survived the filter.
    if len(matching_ids) >= 1:
        return matching_ids

    raise ValueError("All samples have been filtered out for the criteria"
                     " described in the valid states")
def sample_ids_from_metadata_description(mapping_f, valid_states_str):
    """Given a description of metadata, return the corresponding sample ids.

    mapping_f: mapping file, passed to parse_mapping_file.
    valid_states_str: state description, passed to
        parse_metadata_state_descriptions.

    Returns the result of get_sample_ids for the parsed mapping data and
    states.

    Raises ValueError if no sample id matches the description.
    """
    map_data, map_header, _comments = parse_mapping_file(mapping_f)
    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids = get_sample_ids(map_data, map_header, valid_states)

    if len(sample_ids) < 1:
        # Call form of raise: valid on both Python 2 and Python 3, unlike
        # the comma form.
        raise ValueError("All samples have been filtered out for the criteria"
                         " described in the valid states")

    return sample_ids