Пример #1
0
    def test_yield_subset_biom_str_yields_string_pieces_from_valid_input(self):
        """yield_subset_biom_str yields the pieces of a biom string holding only a subset of ids

        The 'GG_OTU_1' and 'GG_OTU_2' ids are selected from
        otu_table1_with_metadata along the 'observations' axis, and every
        expected piece is compared with the piece yielded at the same position.
        """
        axis = 'observations'
        ids_to_load = ['GG_OTU_1', 'GG_OTU_2']
        biom_str = otu_table1_with_metadata

        idxs, new_axis_md = get_axis_indices(biom_str, ids_to_load, axis)
        new_data = direct_slice_data(biom_str, idxs, axis)

        # NOTE: per the original author this currently fails because of a
        # known bug in the BIOM direct_parse_key, and should pass once that
        # bug is fixed.
        obs = list(
            yield_subset_biom_str(biom_str, new_data, new_axis_md, axis))

        exp = [
            '{',
            '"id": "GG_OTU_1"',
            ',',
            '"format": "Biological Observation Matrix v0.9"',
            ',',
            '"format_url": "http://www.qiime.org/svn_documentation/documentation/biom_format.html"',
            ',',
            '"type": "OTU table"',
            ',',
            # Commas inside generated_by strings are not parsed correctly for
            # now, so the truncated value is expected here instead of
            # '"generated_by": "QIIME 1.4.0-dev, svn revision 2753'.
            '"generated_by": "QIIME 1.4.0-dev',
            ',',
            '"date": "2012-02-22T20:50:05.024661"',
            ',',
            '"matrix_type": "sparse"',
            ',',
            '"matrix_element_type": "float"',
            ',',
            '"data": [[0,0,1.0],[0,1,2.0],[0,2,3.0],[0,3,5.0],[1,0,5.0],[1,1,1.0],[1,3,2.0]], "shape": [2, 4]',
            ',',
            '"rows": [{"id": "GG_OTU_1", "metadata": null}, {"id": "GG_OTU_2", "metadata": null}]',
            ',',
            '"columns": [{"id": "Sample1", "metadata": {"pH":7.0}}, {"id": "Sample2", "metadata": {"pH":8.0}}, {"id": "Sample3", "metadata": {"pH":7.0}}, {"id": "Sample4", "metadata": null}]',
            '}',
        ]

        # Compare position by position so a failure names the differing piece.
        for position, expected_piece in enumerate(exp):
            self.assertEqual(obs[position], expected_piece)
    def test_yield_subset_biom_str_yields_string_pieces_from_valid_input(self):
        """yield_subset_biom_str yields the pieces of a biom string holding only a subset of ids

        The 'GG_OTU_1' and 'GG_OTU_2' ids are selected from
        otu_table1_with_metadata along the 'observation' axis; each expected
        piece is then checked against the yielded piece with the same index.
        """
        biom_str = otu_table1_with_metadata
        axis = 'observation'
        ids_to_load = ['GG_OTU_1', 'GG_OTU_2']

        idxs, new_axis_md = get_axis_indices(biom_str, ids_to_load, axis)
        new_data = direct_slice_data(biom_str, idxs, axis)

        # NOTE: per the original author this currently fails because of a
        # known bug in the BIOM direct_parse_key, and should pass once that
        # bug is fixed.
        piece_iter = yield_subset_biom_str(biom_str, new_data, new_axis_md,
                                           axis)
        obs = list(piece_iter)

        exp = [
            '{', '"id": "GG_OTU_1"', ',',
            '"format": "Biological Observation Matrix v0.9"', ',',
            '"format_url": "http://www.qiime.org/svn_documentation/documentation/biom_format.html"', ',',
            '"type": "Gene table"', ',',
            # For now, commas in generated_by strings do not parse correctly,
            # so the expectation is the truncated value rather than
            # '"generated_by": "QIIME 1.4.0-dev, svn revision 2753'.
            '"generated_by": "QIIME 1.4.0-dev', ',',
            '"date": "2012-02-22T20:50:05.024661"', ',',
            '"matrix_type": "sparse"', ',',
            '"matrix_element_type": "float"', ',',
            '"data": [[0,0,1.0],[0,1,2.0],[0,2,3.0],[0,3,5.0],[1,0,5.0],[1,1,1.0],[1,3,2.0]], "shape": [2, 4]', ',',
            '"rows": [{"id": "GG_OTU_1", "metadata": null}, {"id": "GG_OTU_2", "metadata": null}]', ',',
            '"columns": [{"id": "Sample1", "metadata": {"pH":7.0}}, {"id": "Sample2", "metadata": {"pH":8.0}}, {"id": "Sample3", "metadata": {"pH":7.0}}, {"id": "Sample4", "metadata": null}]',
            '}',
        ]

        for index, wanted in enumerate(exp):
            self.assertEqual(obs[index], wanted)
Пример #3
0
    def run(self, **kwargs):
        """Subset a BIOM table along one axis.

        Reads the keyword arguments 'json_table_str', 'hdf5_table', 'axis'
        and 'ids'. Exactly one of 'json_table_str' and 'hdf5_table' must be
        non-None.

        Returns a dict with the single key 'subsetted_table' mapped to a
        ``(table, format_)`` tuple. For JSON input ``format_`` is 'json' and
        ``table`` is a generator of string pieces; for HDF5 input ``format_``
        is 'hdf5' and ``table`` is the result of ``Table.from_hdf5``.

        Raises CommandError when the axis is not in ``self.Axes``, when no
        input table is given, or when both are given.
        """
        json_table_str = kwargs['json_table_str']
        hdf5_biom = kwargs['hdf5_table']
        axis = kwargs['axis']
        ids = kwargs['ids']

        if axis not in self.Axes:
            quoted_axes = ["'%s'" % name for name in self.Axes]
            raise CommandError("Invalid axis '%s'. Must be either %s." % (
                axis, ' or '.join(quoted_axes)))

        have_json = json_table_str is not None
        have_hdf5 = hdf5_biom is not None
        if not have_json and not have_hdf5:
            raise CommandError("Must specify an input table")
        if have_json and have_hdf5:
            raise CommandError("Can only specify one input table")

        if not have_json:
            with biom_open(hdf5_biom) as f:
                table = Table.from_hdf5(f, ids=ids, axis=axis)
            return {'subsetted_table': (table, 'hdf5')}

        idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
        new_data = direct_slice_data(json_table_str, idxs, axis)

        # Top-level keys copied through unchanged, in output order.
        passthrough_keys = ("id", "format", "format_url", "type",
                            "generated_by", "date", "matrix_type",
                            "matrix_element_type")

        # multiple walks over the string. bad form, but easy right now
        # ...should add a yield_and_ignore parser or something.
        def subset_generator():
            yield "{"
            for key in passthrough_keys:
                yield direct_parse_key(json_table_str, key)
                yield ","
            yield new_data
            yield ","
            yield new_axis_md
            yield ","

            # The axis that was not subset is copied from the input as-is.
            untouched_key = "columns" if axis == "observation" else "rows"
            yield direct_parse_key(json_table_str, untouched_key)
            yield "}"

        return {'subsetted_table': (subset_generator(), 'json')}
Пример #4
0
    def run(self, **kwargs):
        """Produce a subset of a BIOM table given as a JSON string or HDF5 file.

        Keyword arguments used: 'json_table_str', 'hdf5_table', 'axis', 'ids'.
        One, and only one, of 'json_table_str' / 'hdf5_table' may be non-None.

        The result is ``{'subsetted_table': (table, format_)}``. ``format_``
        is 'json' when ``table`` is a generator yielding the pieces of the
        subset JSON document, and 'hdf5' when ``table`` is what
        ``Table.from_hdf5`` returned.

        CommandError is raised for an axis outside ``self.Axes`` and for a
        missing or doubly specified input table.
        """
        json_table_str = kwargs['json_table_str']
        hdf5_biom = kwargs['hdf5_table']
        axis = kwargs['axis']
        ids = kwargs['ids']

        if axis not in self.Axes:
            choices = ' or '.join("'%s'" % option for option in self.Axes)
            raise CommandError(
                "Invalid axis '%s'. Must be either %s." % (axis, choices))

        if hdf5_biom is None:
            if json_table_str is None:
                raise CommandError("Must specify an input table")
        elif json_table_str is not None:
            raise CommandError("Can only specify one input table")

        if json_table_str is not None:
            idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
            new_data = direct_slice_data(json_table_str, idxs, axis)

            # multiple walks over the string. bad form, but easy right now
            # ...should add a yield_and_ignore parser or something.
            def subset_generator():
                header_keys = ["id", "format", "format_url", "type",
                               "generated_by", "date", "matrix_type",
                               "matrix_element_type"]
                yield "{"
                for header_key in header_keys:
                    yield direct_parse_key(json_table_str, header_key)
                    yield ","
                # Sliced data and metadata for the subset axis.
                for replacement in (new_data, new_axis_md):
                    yield replacement
                    yield ","

                # The other axis is taken from the input unchanged.
                if axis == "observation":
                    other_axis_key = "columns"
                else:
                    other_axis_key = "rows"
                yield direct_parse_key(json_table_str, other_axis_key)
                yield "}"

            result = (subset_generator(), 'json')
        else:
            with biom_open(hdf5_biom) as f:
                result = (Table.from_hdf5(f, ids=ids, axis=axis), 'hdf5')

        return {'subsetted_table': result}
Пример #5
0
def _subset_table(hdf5_biom, json_table_str, axis, ids):
    if axis not in ['sample', 'observation']:
        raise ValueError("Invalid axis '%s'. Must be either 'sample' or "
                         "'observation'." % axis)

    if hdf5_biom is None and json_table_str is None:
        raise ValueError("Must specify an input table")
    elif hdf5_biom is not None and json_table_str is not None:
        raise ValueError("Can only specify one input table")

    if json_table_str is not None:
        idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
        new_data = direct_slice_data(json_table_str, idxs, axis)

        # multiple walks over the string. bad form, but easy right now
        # ...should add a yield_and_ignore parser or something.
        def subset_generator():
            yield "{"
            yield direct_parse_key(json_table_str, "id")
            yield ","
            yield direct_parse_key(json_table_str, "format")
            yield ","
            yield direct_parse_key(json_table_str, "format_url")
            yield ","
            yield direct_parse_key(json_table_str, "type")
            yield ","
            yield direct_parse_key(json_table_str, "generated_by")
            yield ","
            yield direct_parse_key(json_table_str, "date")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_type")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_element_type")
            yield ","
            yield new_data
            yield ","
            yield new_axis_md
            yield ","

            if axis == "observation":
                yield direct_parse_key(json_table_str, "columns")
            else:
                yield direct_parse_key(json_table_str, "rows")
            yield "}"

        format_ = 'json'
        table = subset_generator()
    else:
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        format_ = 'hdf5'

    return table, format_
Пример #6
0
def _subset_table(hdf5_biom, json_table_str, axis, ids):
    if axis not in ['sample', 'observation']:
        raise ValueError("Invalid axis '%s'. Must be either 'sample' or "
                         "'observation'." % axis)

    if hdf5_biom is None and json_table_str is None:
        raise ValueError("Must specify an input table")
    elif hdf5_biom is not None and json_table_str is not None:
        raise ValueError("Can only specify one input table")

    if json_table_str is not None:
        idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
        new_data = direct_slice_data(json_table_str, idxs, axis)

        # multiple walks over the string. bad form, but easy right now
        # ...should add a yield_and_ignore parser or something.
        def subset_generator():
            yield "{"
            yield direct_parse_key(json_table_str, "id")
            yield ","
            yield direct_parse_key(json_table_str, "format")
            yield ","
            yield direct_parse_key(json_table_str, "format_url")
            yield ","
            yield direct_parse_key(json_table_str, "type")
            yield ","
            yield direct_parse_key(json_table_str, "generated_by")
            yield ","
            yield direct_parse_key(json_table_str, "date")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_type")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_element_type")
            yield ","
            yield new_data
            yield ","
            yield new_axis_md
            yield ","

            if axis == "observation":
                yield direct_parse_key(json_table_str, "columns")
            else:
                yield direct_parse_key(json_table_str, "rows")
            yield "}"

        format_ = 'json'
        table = subset_generator()
    else:
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        format_ = 'hdf5'

    return table, format_
Пример #7
0
def load_subset_from_biom_str(biom_str, ids_to_load, axis="samples"):
    """Load a biom table holding a subset of samples or observations from a BIOM format JSON string

    biom_str: the BIOM format JSON string to take the subset from
    ids_to_load: iterable of ids to keep; each id is stripped of surrounding
     whitespace and converted with str before use
    axis: either "samples" or "observations"

    Returns the result of parse_biom_table on the reassembled subset string.
    Raises InputError when axis is not one of the two accepted values.
    """
    accepted_axes = ('samples', 'observations')
    if axis not in accepted_axes:
        raise InputError(
            'load_subset_from_biom_str axis parameter must be either "samples" or "observations"')

    stripped_ids = [raw_id.strip() for raw_id in ids_to_load]
    ids = map(str, stripped_ids)

    idxs, new_axis_md = get_axis_indices(biom_str, ids, axis)
    new_data = direct_slice_data(biom_str, idxs, axis)

    # Reassemble the yielded pieces into one JSON string, then parse it.
    subset_biom_str = ''.join(
        yield_subset_biom_str(biom_str, new_data, new_axis_md, axis))
    return parse_biom_table(subset_biom_str)
Пример #8
0
def load_subset_from_biom_str(biom_str, ids_to_load, axis="samples"):
    """Build a biom table from the subset of a BIOM format JSON string selected by ids_to_load

    The ids are stripped of surrounding whitespace and converted with str,
    the matching entries on the given axis ("samples" or "observations") are
    sliced out of biom_str, and the resulting JSON string is handed to
    parse_biom_table, whose result is returned.

    InputError is raised when axis is neither "samples" nor "observations".
    """
    if axis not in ('samples', 'observations'):
        error_text = 'load_subset_from_biom_str axis parameter must be either "samples" or "observations"'
        raise InputError(error_text)

    cleaned = [identifier.strip() for identifier in ids_to_load]
    ids = map(str, cleaned)

    idxs, new_axis_md = get_axis_indices(biom_str, ids, axis)
    new_data = direct_slice_data(biom_str, idxs, axis)

    pieces = yield_subset_biom_str(biom_str, new_data, new_axis_md, axis)
    return parse_biom_table(''.join(pieces))
Пример #9
0
    def run(self, **kwargs):
        """Build a generator yielding the pieces of a subset BIOM JSON string.

        Reads the keyword arguments 'table_str', 'axis' and 'ids'.

        Returns a dict with the single key 'subset_generator' mapped to a
        generator of the pieces that make up the subset document.

        Raises CommandError when the axis is not in ``self.Axes``.
        """
        table_str = kwargs['table_str']
        axis = kwargs['axis']
        ids = kwargs['ids']

        if axis not in self.Axes:
            quoted_axes = ["'%s'" % name for name in self.Axes]
            raise CommandError("Invalid axis '%s'. Must be either %s." % (
                axis, ' or '.join(quoted_axes)))

        idxs, new_axis_md = get_axis_indices(table_str, ids, axis)
        new_data = direct_slice_data(table_str, idxs, axis)

        # Top-level keys copied through unchanged, in output order.
        passthrough_keys = ("id", "format", "format_url", "type",
                            "generated_by", "date", "matrix_type",
                            "matrix_element_type")

        # multiple walks over the string. bad form, but easy right now
        # ...should add a yield_and_ignore parser or something.
        def subset_generator():
            yield "{"
            for key in passthrough_keys:
                yield direct_parse_key(table_str, key)
                yield ","
            yield new_data
            yield ","
            yield new_axis_md
            yield ","

            # The axis that was not subset is copied from the input as-is.
            untouched_key = "columns" if axis == "observations" else "rows"
            yield direct_parse_key(table_str, untouched_key)
            yield "}"

        return {'subset_generator': subset_generator()}
Пример #10
0
                 help="A file to write the result to")
    ]
    
if __name__ == '__main__':
    if cogent_cl_parsing:
        option_parser, opts, args =\
                     parse_command_line_parameters(**script_info)
    else:
        parser = OptionParser(option_list=options)
        opts, args = parser.parse_args()

    ids = [l.strip() for l in open(opts.ids_fp)]
    biom_str = open(opts.biom_fp).read()

    idxs, new_axis_md = get_axis_indices(biom_str, ids, opts.axis)
    new_data = direct_slice_data(biom_str, idxs, opts.axis)
    output = open(opts.output_fp,'w')

    # multiple walks over the file. bad form, but easy right now
    # ...should add a yield_and_ignore parser or something.
    output.write('{')
    output.write(direct_parse_key(biom_str, "id"))
    output.write(",")
    output.write(direct_parse_key(biom_str, "format"))
    output.write(",")
    output.write(direct_parse_key(biom_str, "format_url"))
    output.write(",")
    output.write(direct_parse_key(biom_str, "type"))
    output.write(",")
    output.write(direct_parse_key(biom_str, "generated_by"))
    output.write(",")