def main():
    """Split a BIOM table, and optionally its mapping file, by mapping fields.

    Reads the command-line options described by ``script_info``, loads the
    BIOM table and the mapping file, restricts both to the samples they share
    (when the mapping file lists samples missing from the table), and then
    writes one ``.biom`` file per group returned by
    ``make_non_empty_sample_lists``. Unless mapping-file output is
    suppressed, a matching ``.txt`` mapping file is written for each group.

    Output names are ``<input base name>__<field>_<value>_..._`` followed by
    ``.biom`` or ``.txt``, placed in the requested output directory.

    Raises:
        ValueError: if the mapping file and the BIOM table share no samples,
            or if any requested field is not a mapping-file header.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    biom_table_fp = opts.biom_table_fp
    mapping_fp = opts.mapping_fp
    fields = opts.fields.split(',')
    output_dir = opts.output_dir
    suppress_mf = opts.suppress_mapping_file_output

    bt = load_table(biom_table_fp)
    mdata, mheaders, mcomments = parse_mapping_file(mapping_fp)
    mdata = array(mdata)

    # Check that the biom file and mapping file have matching sample names;
    # discard the samples that do not appear in both.
    shared_sample_set = set(mdata[:, 0]).intersection(bt.ids(axis='sample'))
    if not shared_sample_set:
        raise ValueError('Mapping file and biom table share no samples.')
    if len(shared_sample_set) != len(mdata[:, 0]):
        # Preserve the order of the samples in the biom table. Membership is
        # tested against the set so this stays linear in the sample count.
        ss_bt_order = [s for s in bt.ids(axis='sample')
                       if s in shared_sample_set]
        bt = bt.filter(ss_bt_order, axis='sample', inplace=True)
        # subset_mapping_data is handed a list, exactly as before.
        mdata = subset_mapping_data(mdata, list(shared_sample_set))

    # Every requested field must be a header of the mapping file.
    if not all(field in mheaders for field in fields):
        raise ValueError('One or more of the specified fields was not found '
                         'in the mapping file.')

    # Create the output directory and the base names of the output files.
    create_dir(output_dir)
    mf_base_name = join(output_dir, splitext(split(mapping_fp)[1])[0])
    bt_base_name = join(output_dir, splitext(split(biom_table_fp)[1])[0])

    # Write one table (and optionally one mapping file) per group.
    sample_groups, value_groups = make_non_empty_sample_lists(fields,
                                                              mheaders, mdata)

    for sg, vg in zip(sample_groups, value_groups):
        # Suffix looks like '__<field1>_<value1>_<field2>_<value2>__'.
        nb = '__' + ''.join('%s_%s_' % (f, v)
                            for f, v in zip(fields, vg)) + '_'

        tmp_mf_data = subset_mapping_data(mdata, sg)
        tmp_mf_str = format_mapping_file(mheaders, tmp_mf_data, mcomments)
        write_biom_table(bt.filter(sg, axis='sample', inplace=False),
                         bt_base_name + nb + '.biom')

        if not suppress_mf:
            # The context manager closes the file even if the write fails.
            with open(mf_base_name + nb + '.txt', 'w') as mf_out:
                mf_out.writelines(tmp_mf_str)
def test_subset_mapping_data(self):
    """subset_mapping_data returns only the rows of the requested samples."""
    expected_first_three = array([['s0', 'blue', 'hot', '13'],
                                  ['s1', 'blue', 'cold', '1'],
                                  ['s2', 'green', 'cold', '12']],
                                 dtype='|S5')

    # The same rows must come back regardless of the order in which the
    # samples are requested.
    for requested in (['s0', 's1', 's2'], ['s1', 's2', 's0']):
        observed = subset_mapping_data(self.mdata, requested)
        assert_array_equal(observed, expected_first_three)

    # A two-sample request given as (s4, s0); the expected rows list s0
    # before s4.
    expected_pair = array([['s0', 'blue', 'hot', '13'],
                           ['s4', 'blue', '0', '0']],
                          dtype='|S5')
    observed = subset_mapping_data(self.mdata, ['s4', 's0'])
    assert_array_equal(observed, expected_pair)