Exemplo n.º 1
0
 def test_get_sample_cats(self):
     """Test get_sample_cats."""
     pmf_in = \
        {'Sample1': {'test_cat': 'cat1', 'test_corr': '1', 'test_empty': 'abc'},
         'Sample2': {'test_cat': 'cat1', 'test_corr': '1', 'test_empty': 'abc'},
         'Sample3': {'test_cat': 'cat2', 'test_corr': '1', 'test_empty': ''},
         'Sample4': {'test_cat': 'cat2', 'test_corr': '1', 'test_empty': ''},
         'Sample5': {'test_cat': 'cat3', 'test_corr': '1', 'test_empty': 'abc'},
         'Sample6': {'test_cat': 'cat3', 'test_corr': '1', 'test_empty': ''}}
     # test with properly formatted, ordered parsed mapping file
     category = 'test_cat'
     exp = {'Sample1': 'cat1',
     'Sample2': 'cat1',
     'Sample3': 'cat2',
     'Sample4': 'cat2',
     'Sample5': 'cat3',
     'Sample6': 'cat3'}
     obs = get_sample_cats(pmf_in, category)
     self.assertEqual(exp, obs)
     # test with only one category
     category = 'test_corr'
     exp = {'Sample1': '1',
     'Sample2': '1',
     'Sample3': '1',
     'Sample4': '1',
     'Sample5': '1',
     'Sample6': '1'}
     obs = get_sample_cats(pmf_in, category)
     self.assertEqual(exp, obs)
     # test with empty categories
     category = 'test_empty'
     exp = {'Sample1': 'abc',
     'Sample2': 'abc',
     'Sample5': 'abc'}
     obs = get_sample_cats(pmf_in, category)
     self.assertEqual(exp, obs)
Exemplo n.º 2
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    if not opts.biom_samples_are_superset:
        # user indicates biom sample should be subset of mapping file samples
        if any([i in nonshared_samples for i in tmp_bt.ids()]):
            raise ValueError('The samples in the biom table are a superset of' +
                             ' the samples in the mapping file. The script will abort in' +
                             ' this case even though the calculations wouldn\'t be' +
                             ' affected, to ensure consistency within QIIME. Pass the' +
                             ' --biom_samples_are_superset option to disable this behavior.')
    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        print 'The following samples were not shared between the mapping file' +\
            ' and the biom file and will not be included in the analysis:\n' +\
            ' '.join(nonshared_samples)

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)

    # sanity check to prevent inscrutable errors later
    if not all([len(v) > 0 for k, v in cat_sam_indices.items()]):
        raise ValueError('At least one metadata group has no samples. Check ' +
                         'that the mapping file has at least one sample for each value in ' +
                         'the passed category.')
    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error('The t-test and mann_whitney_u test may ' +
                            'only be used when there are two sample groups. Choose another ' +
                            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        sams = reduce(lambda x, y: len(x) + len(y), cat_sam_indices.values())
        if sams <= 20:
            raise ValueError('The number of samples is too small to use the ' +
                             'Mann-Whitney-U normal approximation. Review the script ' +
                             'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.):
            raise ValueError('It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (obsevation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test,
        GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(bt, test_stats, pvals,
                                                fdr_pvals, bon_pvals, means, cat_sam_indices, md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)
    o = open(opts.output_fp, 'w')
    o.writelines('\n'.join(lines))
    o.close()
Exemplo n.º 3
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    # sync the mapping file and the biom file
    tmp_bt = load_table(opts.otu_table_fp)
    tmp_pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
    pmf, bt, nonshared_samples = sync_biom_and_mf(tmp_pmf, tmp_bt)

    # test error conditions for overlapping mf and bt
    if not opts.biom_samples_are_superset:
        # user indicates biom sample should be subset of mapping file samples
        if any([i in nonshared_samples for i in tmp_bt.ids()]):
            raise ValueError(
                'The samples in the biom table are a superset of' +
                ' the samples in the mapping file. The script will abort in' +
                ' this case even though the calculations wouldn\'t be' +
                ' affected, to ensure consistency within QIIME. Pass the' +
                ' --biom_samples_are_superset option to disable this behavior.'
            )
    # user wants non-overlapping samples printed out
    if opts.print_non_overlap:
        print 'The following samples were not shared between the mapping file' +\
            ' and the biom file and will not be included in the analysis:\n' +\
            ' '.join(nonshared_samples)

    # find group indices
    sam_cats = get_sample_cats(pmf, opts.category)
    cat_sam_groups = get_cat_sample_groups(sam_cats)
    cat_sam_indices = get_sample_indices(cat_sam_groups, bt)

    # sanity check to prevent inscrutable errors later
    if not all([len(v) > 0 for k, v in cat_sam_indices.items()]):
        raise ValueError(
            'At least one metadata group has no samples. Check ' +
            'that the mapping file has at least one sample for each value in '
            + 'the passed category.')
    if opts.test in TWO_GROUP_TESTS and len(cat_sam_indices) > 2:
        option_parser.error(
            'The t-test and mann_whitney_u test may ' +
            'only be used when there are two sample groups. Choose another ' +
            'test or another metadata category.')

    # check that assumptions are met for a given test:
    if opts.test == 'mann_whitney_u':
        sams = reduce(lambda x, y: len(x) + len(y), cat_sam_indices.values())
        if sams <= 20:
            raise ValueError(
                'The number of samples is too small to use the ' +
                'Mann-Whitney-U normal approximation. Review the script ' +
                'documentation.')

    # check that the G-test was not selected if the table appears to be
    # relative abundance
    if opts.test == 'g_test':
        if allclose(bt.sum(axis='sample'), 1.) or (bt.sum(axis='whole') == 1.):
            raise ValueError(
                'It appears that the biom table you have passed '
                'is a relative abundance table where values i,j (obsevation i '
                'count in sample j) are fractional and the sum of the columns '
                'is 1.0. This will fail to work properly with the G-test. If '
                'your data sums to 1 in each column but your data is not '
                'relative abundance then the tests will fail anyway because '
                'of the reduced number of observations.')

    # run actual tests
    data_feed = group_significance_row_generator(bt, cat_sam_indices)
    test_stats, pvals, means = run_group_significance_test(
        data_feed, opts.test, GROUP_TEST_CHOICES, int(opts.permutations))

    # calculate corrected pvals
    fdr_pvals = array(benjamini_hochberg_step_down(pvals))
    bon_pvals = bonferroni_correction(pvals)
    # correct for cases where values above 1.0 due to correction
    fdr_pvals = where(fdr_pvals > 1.0, 1.0, fdr_pvals)
    bon_pvals = where(bon_pvals > 1.0, 1.0, bon_pvals)

    # write output results after sorting
    lines = group_significance_output_formatter(bt,
                                                test_stats,
                                                pvals,
                                                fdr_pvals,
                                                bon_pvals,
                                                means,
                                                cat_sam_indices,
                                                md_key=opts.metadata_key)
    lines = sort_by_pval(lines, ind=2)
    o = open(opts.output_fp, 'w')
    o.writelines('\n'.join(lines))
    o.close()