예제 #1
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    biom_table_fp = opts.biom_table_fp
    mapping_fp = opts.mapping_fp
    fields = opts.fields.split(',')
    output_dir = opts.output_dir
    suppress_mf = opts.suppress_mapping_file_output
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    bt = load_table(biom_table_fp)
    mdata, mheaders, mcomments = parse_mapping_file(mapping_fp)
    mdata = array(mdata)

    # check that biom file and mapping file have matching sample names. discard
    # those samples that do not appear in both. 
    shared_samples = list(set(mdata[:, 0]).intersection(bt.ids(axis='sample')))
    if len(shared_samples) == 0:
        raise ValueError('Mapping file and biom table share no samples.')
    elif len(shared_samples) == len(mdata[:, 0]):
        mdata = array(mdata)
    else:
        # we want to preserve the order of the samples in the biom table
        ss_bt_order = [s for s in bt.ids(axis='sample') if s in
                       shared_samples]
        bt = bt.filter(ss_bt_order, axis='sample', inplace=True)
        mdata = subset_mapping_data(mdata, shared_samples)
    # check that headers in mapping data
    if not all([i in mheaders for i in fields]):
        raise ValueError('One or more of the specified fields was not found ' +\
                         'in the mapping file.')

    # create output directory and create base names
    create_dir(output_dir)
    mf_base_name = join(output_dir, splitext(split(mapping_fp)[1])[0])
    bt_base_name = join(output_dir, splitext(split(biom_table_fp)[1])[0])

    # run code and append output
    sample_groups, value_groups = make_non_empty_sample_lists(fields, mheaders,
                                                              mdata)

    for sg, vg in zip(sample_groups, value_groups):
        name_base = '__' + '%s_%s_' * len(vg) + '_'
        name_tmp = []
        for f, v in zip(fields, vg):
            name_tmp.extend([f, v])
        nb = name_base % tuple(name_tmp)

        tmp_mf_data = subset_mapping_data(mdata, sg)
        tmp_mf_str = format_mapping_file(mheaders, tmp_mf_data, mcomments)
        write_biom_table(bt.filter(sg, axis='sample', inplace=False),
                         bt_base_name + nb + '.biom')
        
        if not suppress_mf:
            o = open(mf_base_name + nb + '.txt', 'w')
            o.writelines(tmp_mf_str)
            o.close()
예제 #2
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    biom_table_fp = opts.biom_table_fp
    mapping_fp = opts.mapping_fp
    fields = opts.fields.split(',')
    output_dir = opts.output_dir
    suppress_mf = opts.suppress_mapping_file_output
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    bt = load_table(biom_table_fp)
    mdata, mheaders, mcomments = parse_mapping_file(mapping_fp)
    mdata = array(mdata)

    # check that biom file and mapping file have matching sample names. discard
    # those samples that do not appear in both. 
    shared_samples = list(set(mdata[:, 0]).intersection(bt.ids(axis='sample')))
    if len(shared_samples) == 0:
        raise ValueError('Mapping file and biom table share no samples.')
    elif len(shared_samples) == len(mdata[:, 0]):
        mdata = array(mdata)
    else:
        # we want to preserve the order of the samples in the biom table
        ss_bt_order = [s for s in bt.ids(axis='sample') if s in
                       shared_samples]
        bt = bt.filter(ss_bt_order, axis='sample', inplace=True)
        mdata = subset_mapping_data(mdata, shared_samples)
    # check that headers in mapping data
    if not all([i in mheaders for i in fields]):
        raise ValueError('One or more of the specified fields was not found ' +\
                         'in the mapping file.')

    # create output directory and create base names
    create_dir(output_dir)
    mf_base_name = join(output_dir, splitext(split(mapping_fp)[1])[0])
    bt_base_name = join(output_dir, splitext(split(biom_table_fp)[1])[0])

    # run code and append output
    sample_groups, value_groups = make_non_empty_sample_lists(fields, mheaders,
                                                              mdata)

    for sg, vg in zip(sample_groups, value_groups):
        name_base = '__' + '%s_%s_' * len(vg) + '_'
        name_tmp = []
        for f, v in zip(fields, vg):
            name_tmp.extend([f, v])
        nb = name_base % tuple(name_tmp)

        tmp_mf_data = subset_mapping_data(mdata, sg)
        tmp_mf_str = format_mapping_file(mheaders, tmp_mf_data, mcomments)
        write_biom_table(bt.filter(sg, axis='sample', inplace=False),
                         bt_base_name + nb + '.biom')
        
        if not suppress_mf:
            o = open(mf_base_name + nb + '.txt', 'w')
            o.writelines(tmp_mf_str)
            o.close()
예제 #3
0
 def test_subset_mapping_data(self):
     """Test that mapping data is subset correctly."""
     samples_of_interest = ['s0', 's1', 's2']
     obs = subset_mapping_data(self.mdata, samples_of_interest)
     exp = array([['s0', 'blue', 'hot', '13'], ['s1', 'blue', 'cold', '1'],
                  ['s2', 'green', 'cold', '12']],
                 dtype='|S5')
     assert_array_equal(obs, exp)
     samples_of_interest = ['s1', 's2',
                            's0']  #order change won't affect out
     obs = subset_mapping_data(self.mdata, samples_of_interest)
     assert_array_equal(obs, exp)
     samples_of_interest = ['s4', 's0']  #order change won't affect out
     obs = subset_mapping_data(self.mdata, samples_of_interest)
     exp = array([['s0', 'blue', 'hot', '13'], ['s4', 'blue', '0', '0']],
                 dtype='|S5')
     assert_array_equal(obs, exp)
예제 #4
0
 def test_subset_mapping_data(self):
     """Test that mapping data is subset correctly."""
     samples_of_interest = ['s0', 's1', 's2']
     obs = subset_mapping_data(self.mdata, samples_of_interest)
     exp = array([['s0', 'blue', 'hot', '13'],
                  ['s1', 'blue', 'cold', '1'],
                  ['s2', 'green', 'cold', '12']],
                 dtype='|S5')
     assert_array_equal(obs, exp)
     samples_of_interest = ['s1', 's2', 's0'] #order change won't affect out
     obs = subset_mapping_data(self.mdata, samples_of_interest)
     assert_array_equal(obs, exp)
     samples_of_interest = ['s4', 's0'] #order change won't affect out
     obs = subset_mapping_data(self.mdata, samples_of_interest)
     exp = array([['s0', 'blue', 'hot', '13'],
                  ['s4', 'blue', '0', '0']],
                 dtype='|S5')
     assert_array_equal(obs, exp)