def test_yield_subset_biom_str_yields_string_pieces_from_valid_input(self):
    """yield_subset_biom_str emits the pieces of an id-subsetted biom string.

    Starts from a valid biom string and keeps two ids along the
    'observations' axis.
    """
    biom_str = otu_table1_with_metadata
    ids_to_load = ['GG_OTU_1', 'GG_OTU_2']
    axis = 'observations'
    idxs, new_axis_md = get_axis_indices(biom_str, ids_to_load, axis)
    new_data = direct_slice_data(biom_str, idxs, axis)

    # NOTE: this is expected to fail for now because of a known bug in the
    # BIOM direct_parse_key; it should pass as soon as that is updated.
    obs = list(yield_subset_biom_str(biom_str, new_data, new_axis_md, axis))

    # For now, commas inside a generated_by string do not parse correctly,
    # which is why the expected generated_by piece below stops at
    # "QIIME 1.4.0-dev" rather than the full
    # "QIIME 1.4.0-dev, svn revision 2753".
    exp = [
        '{',
        '"id": "GG_OTU_1"', ',',
        '"format": "Biological Observation Matrix v0.9"', ',',
        '"format_url": "http://www.qiime.org/svn_documentation/documentation/biom_format.html"', ',',
        '"type": "OTU table"', ',',
        '"generated_by": "QIIME 1.4.0-dev', ',',
        '"date": "2012-02-22T20:50:05.024661"', ',',
        '"matrix_type": "sparse"', ',',
        '"matrix_element_type": "float"', ',',
        '"data": [[0,0,1.0],[0,1,2.0],[0,2,3.0],[0,3,5.0],[1,0,5.0],[1,1,1.0],[1,3,2.0]], "shape": [2, 4]', ',',
        '"rows": [{"id": "GG_OTU_1", "metadata": null}, {"id": "GG_OTU_2", "metadata": null}]', ',',
        '"columns": [{"id": "Sample1", "metadata": {"pH":7.0}}, {"id": "Sample2", "metadata": {"pH":8.0}}, {"id": "Sample3", "metadata": {"pH":7.0}}, {"id": "Sample4", "metadata": null}]',
        '}',
    ]

    # Compared piece by piece, driven by the expected list.
    # NOTE(review): any extra pieces at the end of obs are not checked.
    for position, expected_piece in enumerate(exp):
        self.assertEqual(obs[position], expected_piece)
def test_yield_subset_biom_str_yields_string_pieces_from_valid_input(self):
    """Check the string pieces yielded for a subset of observation ids.

    Given a valid biom string, yield_subset_biom_str should yield the
    components of a biom string restricted to the requested ids.
    """
    axis = 'observation'
    ids_to_load = ['GG_OTU_1', 'GG_OTU_2']
    biom_str = otu_table1_with_metadata

    idxs, new_axis_md = get_axis_indices(biom_str, ids_to_load, axis)
    new_data = direct_slice_data(biom_str, idxs, axis)

    # NOTE: currently expected to fail due to a known bug in the BIOM
    # direct_parse_key; once that is updated this should pass.
    pieces = yield_subset_biom_str(biom_str, new_data, new_axis_md, axis)
    obs = list(pieces)

    comma = ','
    exp = [
        '{',
        '"id": "GG_OTU_1"', comma,
        '"format": "Biological Observation Matrix v0.9"', comma,
        '"format_url": "http://www.qiime.org/svn_documentation/documentation/biom_format.html"', comma,
        '"type": "Gene table"', comma,
        # Be aware that commas in generated_by strings won't parse
        # correctly for now: the untruncated value would be
        # "QIIME 1.4.0-dev, svn revision 2753".
        '"generated_by": "QIIME 1.4.0-dev', comma,
        '"date": "2012-02-22T20:50:05.024661"', comma,
        '"matrix_type": "sparse"', comma,
        '"matrix_element_type": "float"', comma,
        '"data": [[0,0,1.0],[0,1,2.0],[0,2,3.0],[0,3,5.0],[1,0,5.0],[1,1,1.0],[1,3,2.0]], "shape": [2, 4]', comma,
        '"rows": [{"id": "GG_OTU_1", "metadata": null}, {"id": "GG_OTU_2", "metadata": null}]', comma,
        '"columns": [{"id": "Sample1", "metadata": {"pH":7.0}}, {"id": "Sample2", "metadata": {"pH":8.0}}, {"id": "Sample3", "metadata": {"pH":7.0}}, {"id": "Sample4", "metadata": null}]',
        '}',
    ]

    # Each expected piece is matched against the observed piece at the same
    # index. NOTE(review): trailing extra pieces in obs go unchecked.
    for index, wanted in enumerate(exp):
        got = obs[index]
        self.assertEqual(got, wanted)
def run(self, **kwargs):
    """Subset a BIOM table by ids along one axis.

    Reads 'json_table_str', 'hdf5_table', 'axis' and 'ids' from kwargs.
    Exactly one of the two table inputs must be non-None.

    Returns a dict with key 'subsetted_table' mapping to a
    (table, format) pair: for JSON input, a generator of string pieces and
    'json'; for HDF5 input, the result of Table.from_hdf5 and 'hdf5'.

    Raises CommandError if the axis is not in self.Axes, or if neither or
    both table inputs are given.
    """
    json_table_str = kwargs['json_table_str']
    hdf5_biom = kwargs['hdf5_table']
    axis = kwargs['axis']
    ids = kwargs['ids']

    if axis not in self.Axes:
        quoted_axes = ["'%s'" % name for name in self.Axes]
        raise CommandError("Invalid axis '%s'. Must be either %s." % (
            axis, ' or '.join(quoted_axes)))

    if hdf5_biom is None and json_table_str is None:
        raise CommandError("Must specify an input table")
    elif hdf5_biom is not None and json_table_str is not None:
        raise CommandError("Can only specify one input table")

    if json_table_str is not None:
        idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
        new_data = direct_slice_data(json_table_str, idxs, axis)

        # Top-level keys copied through unchanged, in output order.
        passthrough_keys = ("id", "format", "format_url", "type",
                            "generated_by", "date", "matrix_type",
                            "matrix_element_type")

        # Each direct_parse_key call is a separate walk over the string.
        # Bad form, but easy right now; a yield_and_ignore style parser
        # would avoid the repeated walks.
        def subset_generator():
            yield "{"
            for key in passthrough_keys:
                yield direct_parse_key(json_table_str, key)
                yield ","
            yield new_data
            yield ","
            yield new_axis_md
            yield ","
            # The axis that was not subsetted is copied from the input.
            if axis == "observation":
                yield direct_parse_key(json_table_str, "columns")
            else:
                yield direct_parse_key(json_table_str, "rows")
            yield "}"

        format_ = 'json'
        table = subset_generator()
    else:
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        format_ = 'hdf5'

    return {'subsetted_table': (table, format_)}
def run(self, **kwargs):
    """Produce a subset of a BIOM table restricted to the given ids.

    kwargs must provide 'json_table_str', 'hdf5_table', 'axis' and 'ids';
    one (and only one) of the two table inputs may be non-None.

    The result is {'subsetted_table': (table, format)} where format is
    'json' (table is a generator of string pieces) or 'hdf5' (table is
    what Table.from_hdf5 returns).

    CommandError is raised for an axis outside self.Axes and for a missing
    or doubly-specified input table.
    """
    json_table_str = kwargs['json_table_str']
    hdf5_biom = kwargs['hdf5_table']
    axis = kwargs['axis']
    ids = kwargs['ids']

    if axis not in self.Axes:
        choices = ' or '.join("'%s'" % candidate for candidate in self.Axes)
        raise CommandError(
            "Invalid axis '%s'. Must be either %s." % (axis, choices))

    have_hdf5 = hdf5_biom is not None
    have_json = json_table_str is not None
    if not (have_hdf5 or have_json):
        raise CommandError("Must specify an input table")
    if have_hdf5 and have_json:
        raise CommandError("Can only specify one input table")

    if not have_json:
        # HDF5 input: the subsetting is delegated to Table.from_hdf5.
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        return {'subsetted_table': (table, 'hdf5')}

    idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
    new_data = direct_slice_data(json_table_str, idxs, axis)

    # Multiple walks over the string (one per direct_parse_key call). Bad
    # form, but easy right now; a yield_and_ignore parser or similar
    # should replace this.
    def subset_generator():
        yield "{"
        for header_key in ("id", "format", "format_url", "type",
                           "generated_by", "date", "matrix_type",
                           "matrix_element_type"):
            yield direct_parse_key(json_table_str, header_key)
            yield ","
        for computed_piece in (new_data, new_axis_md):
            yield computed_piece
            yield ","
        # Emit the axis that was left untouched by the subsetting.
        untouched_key = "columns" if axis == "observation" else "rows"
        yield direct_parse_key(json_table_str, untouched_key)
        yield "}"

    return {'subsetted_table': (subset_generator(), 'json')}
def _subset_table(hdf5_biom, json_table_str, axis, ids):
    """Subset a BIOM table, given as HDF5 or as a JSON string, by ids.

    Parameters
    ----------
    hdf5_biom
        Passed to biom_open when not None.
    json_table_str
        JSON table string, or None.
    axis
        Either 'sample' or 'observation'.
    ids
        The ids to keep along ``axis``.

    Returns
    -------
    tuple
        ``(table, format_)``: a generator of string pieces and 'json' for
        JSON input, or the result of Table.from_hdf5 and 'hdf5'.

    Raises
    ------
    ValueError
        If ``axis`` is invalid, or if neither or both inputs are given.
    """
    if axis not in ['sample', 'observation']:
        raise ValueError("Invalid axis '%s'. Must be either 'sample' or "
                         "'observation'." % axis)

    have_hdf5 = hdf5_biom is not None
    have_json = json_table_str is not None
    if not (have_hdf5 or have_json):
        raise ValueError("Must specify an input table")
    if have_hdf5 and have_json:
        raise ValueError("Can only specify one input table")

    if not have_json:
        # HDF5 input: Table.from_hdf5 performs the subsetting itself.
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        return table, 'hdf5'

    idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
    new_data = direct_slice_data(json_table_str, idxs, axis)

    # Keys copied through unchanged from the input, in output order.
    header_keys = ("id", "format", "format_url", "type", "generated_by",
                   "date", "matrix_type", "matrix_element_type")

    # Multiple walks over the string (one per direct_parse_key call). Bad
    # form, but easy right now; a yield_and_ignore parser or similar should
    # replace this.
    def subset_generator():
        yield "{"
        for key in header_keys:
            yield direct_parse_key(json_table_str, key)
            yield ","
        yield new_data
        yield ","
        yield new_axis_md
        yield ","
        # The axis that was not subsetted is copied from the input.
        if axis == "observation":
            yield direct_parse_key(json_table_str, "columns")
        else:
            yield direct_parse_key(json_table_str, "rows")
        yield "}"

    return subset_generator(), 'json'
def load_subset_from_biom_str(biom_str, ids_to_load, axis="samples"):
    """Load a biom table holding a subset of samples or observations.

    biom_str: a BIOM format JSON string.
    ids_to_load: iterable of ids to keep; each is stripped of surrounding
     whitespace and converted with str().
    axis: either "samples" or "observations".

    Returns the result of parse_biom_table on the subsetted biom string.

    Raises InputError if axis is not "samples" or "observations".
    """
    if axis not in ('samples', 'observations'):
        raise InputError(
            'load_subset_from_biom_str axis parameter must be either '
            '"samples" or "observations"')

    # Build a real list rather than map(): on Python 3 map() returns a
    # one-shot iterator, which would be exhausted after a single pass if
    # the helper iterates or does membership tests more than once.
    ids = [str(raw_id.strip()) for raw_id in ids_to_load]

    idxs, new_axis_md = get_axis_indices(biom_str, ids, axis)
    new_data = direct_slice_data(biom_str, idxs, axis)

    # Reassemble the subsetted table as a single JSON string, then parse it.
    new_table_pieces = yield_subset_biom_str(biom_str, new_data,
                                             new_axis_md, axis)
    subset_biom_str = ''.join(new_table_pieces)
    return parse_biom_table(subset_biom_str)
def load_subset_from_biom_str(biom_str, ids_to_load, axis="samples"):
    """Load a biom table containing a subset of samples or observations.

    biom_str: BIOM format JSON string to subset.
    ids_to_load: iterable of ids to retain; surrounding whitespace is
     stripped and each id is converted with str().
    axis: "samples" or "observations".

    Returns whatever parse_biom_table produces for the subsetted string.

    Raises InputError when axis is neither "samples" nor "observations".
    """
    valid_axes = ('samples', 'observations')
    if axis not in valid_axes:
        raise InputError(
            'load_subset_from_biom_str axis parameter must be either '
            '"samples" or "observations"')

    # A list, not map(): under Python 3 map() is a lazy one-shot iterator,
    # so any second pass over the ids downstream would see nothing.
    ids = [str(entry.strip()) for entry in ids_to_load]

    idxs, new_axis_md = get_axis_indices(biom_str, ids, axis)
    new_data = direct_slice_data(biom_str, idxs, axis)

    # Join the yielded pieces into one JSON string before parsing.
    subset_biom_str = ''.join(
        yield_subset_biom_str(biom_str, new_data, new_axis_md, axis))
    return parse_biom_table(subset_biom_str)
def run(self, **kwargs):
    """Build a generator of string pieces for a subsetted table string.

    Reads 'table_str', 'axis' and 'ids' from kwargs.

    Returns {'subset_generator': generator}, where the generator yields
    the pieces of the subsetted table string in order.

    Raises CommandError if the axis is not in self.Axes.
    """
    table_str = kwargs['table_str']
    axis = kwargs['axis']
    ids = kwargs['ids']

    if axis not in self.Axes:
        quoted_axes = ["'%s'" % option for option in self.Axes]
        raise CommandError("Invalid axis '%s'. Must be either %s." % (
            axis, ' or '.join(quoted_axes)))

    idxs, new_axis_md = get_axis_indices(table_str, ids, axis)
    new_data = direct_slice_data(table_str, idxs, axis)

    # Keys copied through unchanged from the input, in output order.
    header_keys = ("id", "format", "format_url", "type", "generated_by",
                   "date", "matrix_type", "matrix_element_type")

    # Multiple walks over the string (one per direct_parse_key call). Bad
    # form, but easy right now; a yield_and_ignore parser or similar
    # should replace this.
    def subset_generator():
        yield "{"
        for key in header_keys:
            yield direct_parse_key(table_str, key)
            yield ","
        yield new_data
        yield ","
        yield new_axis_md
        yield ","
        # The axis that was not subsetted is copied from the input.
        if axis == "observations":
            yield direct_parse_key(table_str, "columns")
        else:
            yield direct_parse_key(table_str, "rows")
        yield "}"

    return {'subset_generator': subset_generator()}
make_option('-o','--output_fp',type="string", help="A file to write the result to") ] if __name__ == '__main__': if cogent_cl_parsing: option_parser, opts, args =\ parse_command_line_parameters(**script_info) else: parser = OptionParser(option_list=options) opts, args = parser.parse_args() ids = [l.strip() for l in open(opts.ids_fp)] biom_str = open(opts.biom_fp).read() idxs, new_axis_md = get_axis_indices(biom_str, ids, opts.axis) new_data = direct_slice_data(biom_str, idxs, opts.axis) output = open(opts.output_fp,'w') # multiple walks over the file. bad form, but easy right now # ...should add a yield_and_ignore parser or something. output.write('{') output.write(direct_parse_key(biom_str, "id")) output.write(",") output.write(direct_parse_key(biom_str, "format")) output.write(",") output.write(direct_parse_key(biom_str, "format_url")) output.write(",") output.write(direct_parse_key(biom_str, "type")) output.write(",") output.write(direct_parse_key(biom_str, "generated_by"))