Example #1
    def setUp(self):
        self.otu_map1 = [('0',['seq1','seq2','seq5']),
                         ('1',['seq3','seq4']),
                         ('2',['seq6','seq7','seq8'])]
        self.tmp_fp1 = get_tmp_filename(prefix='FormatTests_',suffix='.txt')
        self.tmp_fp2 = get_tmp_filename(prefix='FormatTests_',suffix='.txt')
        self.files_to_remove = []

        self.add_taxa_summary = {'s1':[1,2],'s2':[3,4]}
        self.add_taxa_header = ['sample_id','foo','bar']
        self.add_taxa_order = ['a;b;c','d;e;f']
        self.add_taxa_mapping = [['s1','something1','something2'],
                                 ['s2','something3','something4'],
                                 ['s3','something5','something6']]
        self.biom1 = parse_biom_table(biom1.split('\n'))
        
        self.expected_formatted_html_no_errors_warnings =\
         expected_formatted_html_no_errors_warnings
        self.expected_formatted_html_errors =\
         expected_formatted_html_errors
        self.expected_formatted_html_warnings =\
         expected_formatted_html_warnings
        self.expected_formatted_html_data_nonloc_error =\
         expected_formatted_html_data_nonloc_error

        # For testing formatting of correlation vectors.
        self.corr_vec1 = [('S1', 'T1', 0.7777777777, 0, 0, 0, 0, (0.5, 1.0))]
        self.corr_vec2 = [('S1', 'T1', 0.7777777777, 0, 0, 0, 0, (0.5, 1.0)),
                          ('S2', 'T2', 0.1, 0.05, 0.15, 0.04, 0.12,
                           (-0.1, 0.2)),
                          ('S3', 'T3', 100.68, 0.9, 1, 1, 1, (-0.4, -0.2))]
        self.corr_vec3 = [('S1', 'T1', 0.7777777777, 0, 0, 0, 0, (None, None))]

        # For testing statistical method formatters.
        self.overview_dm = DistanceMatrix.parseDistanceMatrix(
                overview_dm_lines)
        self.overview_map = MetadataMap.parseMetadataMap(overview_map_lines)

        self.soils_dm = DistanceMatrix.parseDistanceMatrix(soils_dm_lines)
        self.soils_map = MetadataMap.parseMetadataMap(soils_map_lines)

        self.anosim_overview = Anosim(self.overview_map, self.overview_dm,
                                      'Treatment')
        self.permanova_overview = Permanova(self.overview_map,
                                            self.overview_dm, 'Treatment')
        self.best_overview = Best(self.overview_dm, self.overview_map, ['DOB'])
        self.best_88_soils = Best(self.soils_dm, self.soils_map,
                ['TOT_ORG_CARB', 'SILT_CLAY', 'ELEVATION',
                 'SOIL_MOISTURE_DEFICIT', 'CARB_NITRO_RATIO',
                 'ANNUAL_SEASON_TEMP', 'ANNUAL_SEASON_PRECPT', 'PH',
                 'CMIN_RATE', 'LONGITUDE', 'LATITUDE'])
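
    # A minimal tearDown sketch to pair with the fixtures above; remove_files
    # is assumed to come from qiime.util, as in other QIIME test modules.
    def tearDown(self):
        remove_files(self.files_to_remove)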
Example #2
def check_mapping_file_category(loaded_biom, mapping_fp, mapping_category,
                                subcategory_1, subcategory_2):
    # Remove mapping file samples that are not in the input BIOM table.
    with open(mapping_fp, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)
    md_map.filterSamples(loaded_biom.ids(axis='sample'), strict=True)

    if mapping_category not in md_map.CategoryNames:
        raise ValueError("category '%s' not found in mapping file "
                         "columns." % mapping_category)

    all_subcategories = md_map.getCategoryValues(md_map.sample_ids,
                                                 mapping_category)

    if subcategory_1 not in all_subcategories:
        raise ValueError("subcategory_1 (-x) '%s' not found in selected "
                         "mapping file column." % subcategory_1)

    if subcategory_2 not in all_subcategories:
        raise ValueError("subcategory_2 (-y) '%s' not found in selected "
                         "mapping file column." % subcategory_2)

    if subcategory_2 == subcategory_1:
        raise ValueError(
            "subcategory_1 (-x) must be different from subcategory_2 (-y)")
Example #3
def choose_cluster_subsets(otu_table_f, map_f, category, num_total_samples):
    otu_table = parse_biom_table(otu_table_f)
    metadata_map = MetadataMap.parseMetadataMap(map_f)

    # Dirty... :(
    try:
        map_f.seek(0)
    except AttributeError:
        pass

    if num_total_samples > len(otu_table.SampleIds):
        raise InvalidSubsetSize("Too many total samples (%d) were specified "
                                "as a subset size. There are only %d total "
                                "samples to choose a subset from." %
                                (num_total_samples, len(otu_table.SampleIds)))

    category_map = defaultdict(list)
    for samp_id in metadata_map.SampleIds:
        # Mapping files can have more samples than OTU tables.
        if samp_id in otu_table.SampleIds:
            category_val = metadata_map.getCategoryValue(samp_id, category)
            category_map[category_val].append(samp_id)

    samp_ids_to_keep, extra_samps = _choose_items_from_clusters(
            category_map, otu_table.SampleIds, num_total_samples)
    samp_ids_to_keep.extend(extra_samps)

    assert len(samp_ids_to_keep) == num_total_samples, \
           "%d != %d" % (len(samp_ids_to_keep), num_total_samples)
    assert len(samp_ids_to_keep) == len(set(samp_ids_to_keep)), \
           "Duplicate sample IDs in subset"

    return (filter_samples_from_otu_table(otu_table, samp_ids_to_keep, 0, inf),
            filter_mapping_file_from_mapping_f(map_f, samp_ids_to_keep))
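
# A hypothetical call sketch (file paths are invented): pick 10 sample IDs
# spread across the values of 'BodySite', then get back the filtered OTU
# table and the filtered mapping file lines.
#
#     with open('otu_table.biom', 'U') as otu_f, open('map.txt', 'U') as map_f:
#         sub_table, sub_map = choose_cluster_subsets(otu_f, map_f,
#                                                     'BodySite', 10)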
Example #4
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    verbose = opts.verbose
    output_fp = opts.output_fp
    mapping_files = [open(fp, 'U') for fp in opts.mapping_fps]
    no_data_value = opts.no_data_value

    mapping_data = MetadataMap.mergeMappingFiles(mapping_files,
                                                 no_data_value=no_data_value)

    with open(output_fp, 'w') as f:
        f.write(str(mapping_data))
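
# mergeMappingFiles combines sample metadata across the input files; a sketch
# of the behavior (file names are invented): a sample that lacks a column
# present in another file gets no_data_value filled in for that column.
#
#     merged = MetadataMap.mergeMappingFiles(
#             [open('map1.txt', 'U'), open('map2.txt', 'U')],
#             no_data_value='no_data')
#     with open('merged_map.txt', 'w') as f:
#         f.write(str(merged))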
Example #6
def _generate_alpha_diversity_boxplots(collated_adiv_dir, map_fp,
                                       split_category, comparison_category,
                                       rarefaction_depth, output_dir):
    """Generates per-body-site self vs. other alpha diversity boxplots.

    Creates a plot for each input collated alpha diversity file (i.e. metric)
    in collated_adiv_dir. Returns a list of plot filenames that were created in
    output_dir.

    Arguments:
        collated_adiv_dir - path to directory containing one or more collated
            alpha diversity files
        map_fp - filepath to metadata mapping file
        split_category - category to split on, e.g. body site. A boxplot will
            be created for each category value (e.g. tongue, palm, etc.)
        comparison_category - category to split on within each of the split
            categories (e.g. self, other)
        rarefaction_depth - rarefaction depth to use when pulling data from
            rarefaction files
        output_dir - directory to write output plot images to
    """
    metadata_map = MetadataMap.parseMetadataMap(open(map_fp, 'U'))
    collated_adiv_fps = glob(join(collated_adiv_dir, '*.txt'))
    plot_title = 'Alpha diversity (%d seqs/sample)' % rarefaction_depth

    # Generate a plot for each collated alpha diversity metric file.
    created_files = []
    for collated_adiv_fp in collated_adiv_fps:
        adiv_metric = splitext(basename(collated_adiv_fp))[0]

        x_tick_labels, dists = _collect_alpha_diversity_boxplot_data(
                open(collated_adiv_fp, 'U'), metadata_map, rarefaction_depth,
                split_category, comparison_category)

        plot_figure = generate_box_plots(dists,
                                         x_tick_labels=x_tick_labels,
                                         title=plot_title,
                                         x_label='Grouping',
                                         y_label=format_title(adiv_metric))
        plot_fp = join(output_dir, '%s.png' % adiv_metric)
        plot_figure.savefig(plot_fp)
        created_files.append(basename(plot_fp))

    return created_files
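
# A hypothetical driver for the helper above (directory, file, and category
# names are invented):
#
#     created = _generate_alpha_diversity_boxplots(
#             'collated_adiv/', 'map.txt', 'BodySite', 'SelfOrOther',
#             rarefaction_depth=100, output_dir='plots/')
#     # e.g. ['PD_whole_tree.png', 'chao1.png']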
Example #7
def subset_groups(dm_f, map_f, category, max_group_size):
    dm_labels, dm_data = parse_distmat(dm_f)
    metadata_map = MetadataMap.parseMetadataMap(map_f)

    category_map = defaultdict(list)
    for samp_id in metadata_map.SampleIds:
        # Mapping files can have more samples than distance matrices, which can
        # happen in this case since we are dealing with rarefied OTU tables
        # (samples get dropped).
        if samp_id in dm_labels:
            category_val = metadata_map.getCategoryValue(samp_id, category)
            category_map[category_val].append(samp_id)

    samp_ids_to_keep = []
    for category_val, samp_ids in category_map.items():
        samp_ids_to_keep.extend(
                sample(samp_ids, min(max_group_size, len(samp_ids))))

    return filter_samples_from_distance_matrix((dm_labels, dm_data),
                                               samp_ids_to_keep, negate=True)
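
# A hypothetical call (filenames invented): keep at most five samples per
# 'Treatment' group. Note that negate=True above means the sampled IDs are
# retained rather than discarded; the return value is the filtered distance
# matrix produced by filter_samples_from_distance_matrix.
#
#     with open('dm.txt', 'U') as dm_f, open('map.txt', 'U') as map_f:
#         sub_dm = subset_groups(dm_f, map_f, 'Treatment', 5)
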
def run_core_diversity_analyses(biom_fp,
                                mapping_fp,
                                sampling_depth,
                                output_dir,
                                qiime_config,
                                command_handler=call_commands_serially,
                                tree_fp=None,
                                params=None,
                                categories=None,
                                arare_min_rare_depth=10,
                                arare_num_steps=10,
                                parallel=False,
                                suppress_taxa_summary=False,
                                suppress_beta_diversity=False,
                                suppress_alpha_diversity=False,
                                suppress_group_significance=False,
                                status_update_callback=print_to_stdout):
    """
    """
    if categories is not None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = \
            parse_mapping_file_to_dict(open(mapping_fp, 'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)" %
                    (c, ', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values." %
                    c)

    else:
        categories = []
    comma_separated_categories = ','.join(categories)
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []

    # begin logging
    old_log_fps = glob(join(output_dir, 'log_20*txt'))
    log_fp = generate_log_fp(output_dir)
    index_links.append(
        ('Master run log', log_fp, _index_headers['run_summary']))
    for old_log_fp in old_log_fps:
        index_links.append(
            ('Previous run log', old_log_fp, _index_headers['run_summary']))
    logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params['biom-summarize-table'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = \
            "biom summarize-table -i %s -o %s %s" % \
            (biom_fp, biom_table_stats_output_fp, params_str)
        commands.append([('Generate BIOM table summary',
                          biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" %
                     biom_table_stats_output_fp)
    index_links.append(('BIOM table statistics', biom_table_stats_output_fp,
                        _index_headers['run_summary']))

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
            (biom_fp, filtered_biom_fp, sampling_depth)
        commands.append([(
            'Filter low sequence count samples from table (minimum sequence count: %d)'
            % sampling_depth, filter_samples_cmd)])
    else:
        logger.write(
            "Skipping filter_samples_from_otu_table.py as %s exists.\n\n" %
            filtered_biom_fp)
    biom_fp = filtered_biom_fp

    # rarefy the BIOM table to sampling_depth
    rarefied_biom_fp = "%s/table_even%d.biom" % (output_dir, sampling_depth)
    if not exists(rarefied_biom_fp):
        single_rarefaction_cmd = "single_rarefaction.py -i %s -o %s -d %d" %\
            (biom_fp, rarefied_biom_fp, sampling_depth)
        commands.append([
            ('Rarefy the OTU table to %d sequences/sample' % sampling_depth,
             single_rarefaction_cmd)
        ])
    else:
        logger.write("Skipping single_rarefaction.py as %s exists.\n\n" %
                     rarefied_biom_fp)

    # run initial commands and reset the command list
    if len(commands) > 0:
        command_handler(commands,
                        status_update_callback,
                        logger,
                        close_logger_on_success=False)
        commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir, sampling_depth)
        # Need to check for the existence of any distance matrices, since the user
        # can select which will be generated.
        existing_dm_fps = glob('%s/*_dm.txt' % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=rarefied_biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                # Note: we pass sampling_depth=None here as
                # we rarefy the BIOM table above and pass that
                # in here.
                sampling_depth=None,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping beta_diversity_through_plots.py as %s exist(s).\n\n"
                % ', '.join(existing_dm_fps))
            even_dm_fps = [(split(fp)[1][:-len('_dm.txt')], fp)
                           for fp in existing_dm_fps]

        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params['make_distance_boxplots'])
        except KeyError:
            params_str = ''

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % (
                    bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = '%s/%s_Distances.pdf' % (boxplots_output_dir,
                                                          category)
                stats_output_fp = '%s/%s_Stats.txt' % (boxplots_output_dir,
                                                       category)
                if not exists(plot_output_fp):
                    boxplots_cmd = \
                        'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                        (dm_fp, category, boxplots_output_dir,
                         mapping_fp, params_str)
                    commands.append([('Boxplots (%s)' % category, boxplots_cmd)
                                     ])
                else:
                    logger.write(
                        "Skipping make_distance_boxplots.py for %s as %s exists.\n\n"
                        % (category, plot_output_fp))
                index_links.append(
                    ('Distance boxplots (%s)' % bdiv_metric, plot_output_fp,
                     _index_headers['beta_diversity_even'] % sampling_depth))
                index_links.append(
                    ('Distance boxplots statistics (%s)' % bdiv_metric,
                     stats_output_fp,
                     _index_headers['beta_diversity_even'] % sampling_depth))

            index_links.append(
                ('PCoA plot (%s)' % bdiv_metric,
                 '%s/%s_emperor_pcoa_plot/index.html' %
                 (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('Distance matrix (%s)' % bdiv_metric,
                 '%s/%s_dm.txt' % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('Principal coordinate matrix (%s)' % bdiv_metric,
                 '%s/%s_pc.txt' % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir,
                                                     sampling_depth)
        rarefaction_plots_output_fp = \
            '%s/alpha_rarefaction_plots/rarefaction_plots.html' % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback,
                retain_intermediate_files=False)
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n" %
                         rarefaction_plots_output_fp)

        index_links.append(
            ('Alpha rarefaction plots', rarefaction_plots_output_fp,
             _index_headers['alpha_diversity']))

        collated_alpha_diversity_fps = \
            glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''

        if len(categories) > 0:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(
                    split(collated_alpha_diversity_fp)[1])[0]
                compare_alpha_output_dir = '%s/compare_%s' % \
                    (arare_full_output_dir, alpha_metric)
                if not exists(compare_alpha_output_dir):
                    compare_alpha_cmd = \
                        'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                        (collated_alpha_diversity_fp,
                         mapping_fp,
                         comma_separated_categories,
                         compare_alpha_output_dir,
                         params_str)
                    commands.append([
                        ('Compare alpha diversity (%s)' % alpha_metric,
                         compare_alpha_cmd)
                    ])
                    for category in categories:
                        alpha_comparison_stat_fp = '%s/%s_stats.txt' % \
                            (compare_alpha_output_dir, category)
                        alpha_comparison_boxplot_fp = '%s/%s_boxplots.pdf' % \
                            (compare_alpha_output_dir, category)
                        index_links.append(
                            ('Alpha diversity statistics (%s, %s)' %
                             (category, alpha_metric),
                             alpha_comparison_stat_fp,
                             _index_headers['alpha_diversity']))
                        index_links.append(
                            ('Alpha diversity boxplots (%s, %s)' %
                             (category, alpha_metric),
                             alpha_comparison_boxplot_fp,
                             _index_headers['alpha_diversity']))
                else:
                    logger.write("Skipping compare_alpha_diversity.py"
                                 " for %s as %s exists.\n\n" %
                                 (alpha_metric, compare_alpha_output_dir))
        else:
            logger.write("Skipping compare_alpha_diversity.py as"
                         " no categories were provided.\n\n")

    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = glob(
            join(taxa_plots_output_dir, 'taxa_summary_plots', '*.html'))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n"
                % ', '.join(existing_taxa_plot_html_fps))

        index_links.append(
            ('Taxa summary bar plots',
             '%s/taxa_summary_plots/bar_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))
        index_links.append(
            ('Taxa summary area plots',
             '%s/taxa_summary_plots/area_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))
        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,
                                                           category)
            # need to check for existence of any html files, since the user can
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob('%s/taxa_summary_plots/*.html' %
                                               taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback)
            else:
                logger.write(
                    "Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n"
                    % (category, ', '.join(existing_taxa_plot_html_fps)))

            index_links.append(
                ('Taxa summary bar plots',
                 '%s/taxa_summary_plots/bar_charts.html' %
                 taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))
            index_links.append(
                ('Taxa summary area plots',
                 '%s/taxa_summary_plots/area_charts.html' %
                 taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))

    if not suppress_group_significance:
        try:
            params_str = get_params_str(params['group_significance'])
        except KeyError:
            params_str = ''
        # group significance tests, aka category significance
        for category in categories:
            group_significance_fp = \
                '%s/group_significance_%s.txt' % (output_dir, category)
            if not exists(group_significance_fp):
                # Build the OTU category significance command
                group_significance_cmd = \
                    'group_significance.py -i %s -m %s -c %s -o %s %s' %\
                    (rarefied_biom_fp, mapping_fp, category,
                     group_significance_fp, params_str)
                commands.append([('Group significance (%s)' % category,
                                  group_significance_cmd)])
            else:
                logger.write(
                    "Skipping group_significance.py for %s as %s exists.\n\n" %
                    (category, group_significance_fp))

            index_links.append(
                ('Category significance (%s)' % category, group_significance_fp,
                 _index_headers['group_significance']))

    filtered_biom_gzip_fp = '%s.gz' % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([('Compress the filtered BIOM table',
                          'gzip %s' % filtered_biom_fp)])
    else:
        logger.write(
            "Skipping compression of the filtered BIOM table as %s exists.\n\n" %
            filtered_biom_gzip_fp)
    index_links.append(
        ('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
         filtered_biom_gzip_fp, _index_headers['run_summary']))

    rarefied_biom_gzip_fp = '%s.gz' % rarefied_biom_fp
    if not exists(rarefied_biom_gzip_fp):
        commands.append([('Compress the rarefied BIOM table',
                          'gzip %s' % rarefied_biom_fp)])
    else:
        logger.write(
            "Skipping compression of the rarefied BIOM table as %s exists.\n\n" %
            rarefied_biom_gzip_fp)
    index_links.append(
        ('Rarefied BIOM table (sampling depth: %d)' % sampling_depth,
         rarefied_biom_gzip_fp, _index_headers['run_summary']))

    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()

    generate_index_page(index_links, index_fp)
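
# A hypothetical top-level invocation (paths invented; load_qiime_config is
# assumed to come from qiime.util):
#
#     run_core_diversity_analyses('otu_table.biom', 'map.txt',
#                                 sampling_depth=100, output_dir='cdout/',
#                                 qiime_config=load_qiime_config(),
#                                 categories=['Treatment'])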
Example #10
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run (e.g. 'anosim',
            'permanova', 'best', or one of the R-based methods)
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'best', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'best' or 'morans_i', this parameter will be
            ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """

    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BEST analyses). "
                         "Please use a different metadata column to perform "
                         "statistical tests on.")

    # Parse the mapping file and distance matrix.
    md_map = MetadataMap.parseMetadataMap(open(map_fp, 'U'))
    dm = DistanceMatrix.parseDistanceMatrix(open(dm_fp, 'U'))

    # Remove any samples from the mapping file that aren't in the distance
    # matrix (important for validation checks). Use strict=True so that an
    # error is raised if the distance matrix contains any samples that aren't
    # in the mapping file.
    md_map.filterSamples(dm.SampleIds, strict=True)

    # Run the specified statistical method.
    if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
        # These methods are run in R. Input validation must be done here before
        # running the R commands. The pure-Python implementations perform all
        # validation in the classes in the stats module.

        # Make sure the input distance matrix is symmetric and hollow.
        if not dm.is_symmetric_and_hollow():
            raise ValueError("The distance matrix must be symmetric and "
                             "hollow.")

        # Check to make sure all categories passed in are in mapping file and
        # are not all the same value.
        for category in categories:
            if category not in md_map.CategoryNames:
                raise ValueError("Category '%s' not found in mapping file "
                                 "columns." % category)

            if md_map.hasSingleCategoryValue(category):
                raise ValueError("All values in category '%s' are the "
                                 "same. The statistical method '%s' cannot "
                                 "operate on a category that creates only "
                                 "a single group of samples (e.g. there "
                                 "are no 'between' distances because "
                                 "there is only a single group)." %
                                 (category, method))

        # Build the command arguments string.
        command_args = [
            '-d %s -m %s -c %s -o %s' % (dm_fp, map_fp, categories[0], out_dir)
        ]

        if method == 'morans_i':
            # Moran's I requires only numeric categories.
            for category in categories:
                if not md_map.isNumericCategory(category):
                    raise TypeError(
                        "The category '%s' is not numeric. Not "
                        "all values could be converted to numbers." % category)
        else:
            # The rest require groups of samples, so the category values cannot
            # all be unique.
            for category in categories:
                if md_map.hasUniqueCategoryValues(category):
                    raise ValueError("All values in category '%s' are unique. "
                                     "This statistical method cannot operate "
                                     "on a category with unique values (e.g. "
                                     "there are no 'within' distances because "
                                     "each group of samples contains only a "
                                     "single sample)." % category)

            # Only Moran's I doesn't accept a number of permutations.
            if num_perms < 0:
                raise ValueError("The number of permutations must be greater "
                                 "than or equal to zero.")

            command_args[0] += ' -n %d' % num_perms

        rex = RExecutor(TmpDir=get_qiime_temp_dir())
        rex(command_args, '%s.r' % method, output_dir=out_dir)
    elif method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(num_perms)

        out_f = open(join(out_dir, '%s_results.txt' % method), 'w+')
        out_f.write(format_anosim_results(anosim_results))
        out_f.close()
    elif method == 'best':
        best = Best(dm, md_map, categories)
        best_results = best()

        out_f = open(join(out_dir, '%s_results.txt' % method), 'w+')
        out_f.write(format_best_results(best_results))
        out_f.close()
    elif method == 'permanova':
        permanova = Permanova(md_map, dm, categories[0])
        permanova_results = permanova(num_perms)

        out_f = open(join(out_dir, '%s_results.txt' % method), 'w+')
        out_f.write(format_permanova_results(permanova_results))
        out_f.close()
    else:
        raise ValueError("Unrecognized method '%s'. Valid methods: %r" %
                         (method, methods))
Example #11
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run (e.g. 'anosim',
            'permanova', 'best', or one of the R-based methods)
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'best', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'best' or 'morans_i', this parameter will be
            ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """

    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BEST analyses). "
                         "Please use a different metadata column to perform "
                         "statistical tests on.")

    # Parse the mapping file and distance matrix.
    with open(map_fp, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)

    with open(dm_fp, 'U') as dm_f:
        dm = SymmetricDistanceMatrix.from_file(dm_f)

    # Remove any samples from the mapping file that aren't in the distance
    # matrix (important for validation checks). Use strict=True so that an
    # error is raised if the distance matrix contains any samples that aren't
    # in the mapping file.
    md_map.filterSamples(dm.sample_ids, strict=True)

    # Run the specified statistical method.
    if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
        # These methods are run in R. Input validation must be done here before
        # running the R commands. The pure-Python implementations perform all
        # validation in the classes in the stats module.

        # Check to make sure all categories passed in are in mapping file and
        # are not all the same value.
        for category in categories:
            if category not in md_map.CategoryNames:
                raise ValueError("Category '%s' not found in mapping file "
                                 "columns." % category)

            if md_map.hasSingleCategoryValue(category):
                raise ValueError("All values in category '%s' are the "
                                 "same. The statistical method '%s' cannot "
                                 "operate on a category that creates only "
                                 "a single group of samples (e.g. there "
                                 "are no 'between' distances because "
                                 "there is only a single group)."
                                 % (category, method))

        # Build the command arguments string.
        command_args = ['-d %s -m %s -c %s -o %s'
                        % (dm_fp, map_fp, categories[0], out_dir)]

        if method == 'morans_i':
            # Moran's I requires only numeric categories.
            for category in categories:
                if not md_map.isNumericCategory(category):
                    raise TypeError("The category '%s' is not numeric. Not "
                                    "all values could be converted to numbers."
                                    % category)
        else:
            # The rest require groups of samples, so the category values cannot
            # all be unique.
            for category in categories:
                if md_map.hasUniqueCategoryValues(category):
                    raise ValueError("All values in category '%s' are unique. "
                                     "This statistical method cannot operate "
                                     "on a category with unique values (e.g. "
                                     "there are no 'within' distances because "
                                     "each group of samples contains only a "
                                     "single sample)." % category)

            # Only Moran's I doesn't accept a number of permutations.
            if num_perms < 0:
                raise ValueError("The number of permutations must be greater "
                                 "than or equal to zero.")

            command_args[0] += ' -n %d' % num_perms

        rex = RExecutor(TmpDir=get_qiime_temp_dir())
        rex(command_args, '%s.r' % method, output_dir=out_dir)
    elif method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(num_perms)

        out_f = open(join(out_dir, '%s_results.txt' % method), 'w+')
        out_f.write(format_anosim_results(anosim_results))
        out_f.close()
    elif method == 'best':
        best = Best(dm, md_map, categories)
        best_results = best()

        out_f = open(join(out_dir, '%s_results.txt' % method), 'w+')
        out_f.write(format_best_results(best_results))
        out_f.close()
    elif method == 'permanova':
        permanova = Permanova(md_map, dm, categories[0])
        permanova_results = permanova(num_perms)

        out_f = open(join(out_dir, '%s_results.txt' % method), 'w+')
        out_f.write(format_permanova_results(permanova_results))
        out_f.close()
    else:
        raise ValueError("Unrecognized method '%s'. Valid methods: %r"
                         % (method, methods))
Example #12
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run (e.g. 'anosim',
            'permanova', 'bioenv', or one of the R-based methods)
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'bioenv', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'bioenv' or 'morans_i', this parameter will
            be ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """
    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BIO-ENV "
                         "analyses). Please choose a different metadata "
                         "column to perform statistical tests on.")

    dm = DistanceMatrix.read(dm_fp)

    if method in ('anosim', 'permanova', 'bioenv'):
        with open(map_fp, 'U') as map_f:
            md_dict = parse_mapping_file_to_dict(map_f)[0]
        df = pd.DataFrame.from_dict(md_dict, orient='index')

        out_fp = join(out_dir, '%s_results.txt' % method)

        if method in ('anosim', 'permanova'):
            if method == 'anosim':
                method_cls = ANOSIM
            elif method == 'permanova':
                method_cls = PERMANOVA

            method_inst = method_cls(dm, df, column=categories[0])
            results = method_inst(num_perms)

            with open(out_fp, 'w') as out_f:
                out_f.write(results.summary())
        elif method == 'bioenv':
            results = bioenv(dm, df, columns=categories)
            results.to_csv(out_fp, sep='\t')
    else:
        # Remove any samples from the mapping file that aren't in the distance
        # matrix (important for validation checks). Use strict=True so that an
        # error is raised if the distance matrix contains any samples that
        # aren't in the mapping file.
        with open(map_fp, 'U') as map_f:
            md_map = MetadataMap.parseMetadataMap(map_f)
        md_map.filterSamples(dm.ids, strict=True)

        # These methods are run in R. Input validation must be done here before
        # running the R commands.
        if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
            # Check to make sure all categories passed in are in mapping file
            # and are not all the same value.
            for category in categories:
                if category not in md_map.CategoryNames:
                    raise ValueError("Category '%s' not found in mapping file "
                                     "columns." % category)

                if md_map.hasSingleCategoryValue(category):
                    raise ValueError("All values in category '%s' are the "
                                     "same. The statistical method '%s' "
                                     "cannot operate on a category that "
                                     "creates only a single group of samples "
                                     "(e.g. there are no 'between' distances "
                                     "because there is only a single group)."
                                     % (category, method))

            # Build the command arguments string.
            command_args = ['-d %s -m %s -c %s -o %s'
                            % (dm_fp, map_fp, categories[0], out_dir)]

            if method == 'morans_i':
                # Moran's I requires only numeric categories.
                for category in categories:
                    if not md_map.isNumericCategory(category):
                        raise TypeError("The category '%s' is not numeric. "
                                        "Not all values could be converted to "
                                        "numbers." % category)
            else:
                # The rest require groups of samples, so the category values
                # cannot all be unique.
                for category in categories:
                    if md_map.hasUniqueCategoryValues(category):
                        raise ValueError("All values in category '%s' are "
                                         "unique. This statistical method "
                                         "cannot operate on a category with "
                                         "unique values (e.g. there are no "
                                         "'within' distances because each "
                                         "group of samples contains only a "
                                         "single sample)." % category)

                # Only Moran's I doesn't accept a number of permutations.
                if num_perms < 0:
                    raise ValueError("The number of permutations must be "
                                     "greater than or equal to zero.")

                command_args[0] += ' -n %d' % num_perms

            rex = RExecutor(TmpDir=get_qiime_temp_dir())
            rex(command_args, '%s.r' % method, output_dir=out_dir)
        else:
            raise ValueError("Unrecognized method '%s'. Valid methods: %r"
                             % (method, methods))
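
# A hypothetical invocation (paths invented): run ANOSIM on the 'Treatment'
# column with 999 permutations, writing anosim_results.txt into out_dir.
#
#     compare_categories('dm.txt', 'map.txt', 'anosim', ['Treatment'],
#                        999, 'out/')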
Example #13
    def setUp(self):
        """Define some sample data that will be used by the tests."""
        # The prefix to use for temporary files. This prefix may be added to,
        # but all temp dirs and files created by the tests will have this
        # prefix at a minimum.
        self.prefix = 'my_microbes_tests_'

        self.start_dir = getcwd()
        self.dirs_to_remove = []
        self.files_to_remove = []

        self.tmp_dir = get_qiime_temp_dir()
        if not exists(self.tmp_dir):
            makedirs(self.tmp_dir)
            # If test creates the temp dir, also remove it.
            self.dirs_to_remove.append(self.tmp_dir)

        # Set up temporary input and output directories.
        self.input_dir = mkdtemp(dir=self.tmp_dir,
                                  prefix='%sinput_dir_' % self.prefix)
        self.dirs_to_remove.append(self.input_dir)

        self.output_dir = mkdtemp(dir=self.tmp_dir,
                                  prefix='%soutput_dir_' % self.prefix)
        self.dirs_to_remove.append(self.output_dir)

        # Data that will be used by the tests.
        self.metadata_map = MetadataMap.parseMetadataMap(
                mapping_str.split('\n'))
        self.mapping_data, self.mapping_header = parse_mapping_file(
                mapping_str.split('\n'))[:2]

        self.mapping_fp = join(self.input_dir, 'map.txt')
        mapping_f = open(self.mapping_fp, 'w')
        mapping_f.write(mapping_str)
        mapping_f.close()
        self.files_to_remove.append(self.mapping_fp)

        self.personal_metadata_map = MetadataMap.parseMetadataMap(
                personal_mapping_str.split('\n'))
        self.personal_mapping_data = parse_mapping_file(
                personal_mapping_str.split('\n'))[0]

        self.rarefaction_lines = collated_alpha_div_str.split('\n')
        self.na_rarefaction_lines = collated_alpha_div_na_str.split('\n')

        self.rarefaction_dir = join(self.input_dir, 'collated_adiv')
        create_dir(self.rarefaction_dir)
        self.rarefaction_fp = join(self.rarefaction_dir, 'PD_whole_tree.txt')
        rarefaction_f = open(self.rarefaction_fp, 'w')
        rarefaction_f.write(collated_alpha_div_str)
        rarefaction_f.close()
        self.files_to_remove.append(self.rarefaction_fp)

        self.coord_fp = join(self.input_dir, 'coord.txt')
        coord_f = open(self.coord_fp, 'w')
        coord_f.write(coord_str)
        coord_f.close()
        self.files_to_remove.append(self.coord_fp)

        self.otu_table_fp = join(self.input_dir, 'otu_table.biom')
        otu_table_f = open(self.otu_table_fp, 'w')
        otu_table_f.write(otu_table_str)
        otu_table_f.close()
        self.files_to_remove.append(self.otu_table_fp)

        self.prefs_fp = join(self.input_dir, 'prefs.txt')
        prefs_f = open(self.prefs_fp, 'w')
        prefs_f.write(prefs_str)
        prefs_f.close()
        self.files_to_remove.append(self.prefs_fp)

        # Placeholder email addresses are used throughout the test data.
        self.recipients = ["# a comment", " ", " foo1\tfoo@bar.baz  ",
                            "foo2\t foo2@bar.baz,  foo3@bar.baz,foo4@bar.baz "]

        self.email_settings = ["# A comment", "# Another comment",
                "smtp_server\tsome.smtp.server", "smtp_port\t42",
                "sender\tfrom@bar.baz", "password\t424242!"]
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_otu_category_significance=False,
    status_update_callback=print_to_stdout):
    """
    """
    if categories is not None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = \
         parse_mapping_file_to_dict(open(mapping_fp,'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError("Category '%s' is not a column header "
                 "in your mapping file. "
                 "Categories are case and white space sensitive. Valid "
                 "choices are: (%s)" % (c,', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError("Category '%s' contains only one value. "
                 "Categories analyzed here require at least two values." % c)

    else:
        categories = []
    
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])
        
    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []
    
    # begin logging
    log_fp = generate_log_fp(output_dir)
    index_links.append(('Master run log',log_fp,_index_headers['run_summary']))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp,mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger,input_fps)

    # run print_biom_table_summary.py on input BIOM table
    try:
        params_str = get_params_str(params['print_biom_table_summary'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    print_biom_table_summary_cmd = \
     "print_biom_table_summary.py -i %s -o %s --suppress_md5 %s" % \
     (biom_fp, biom_table_stats_output_fp,params_str)
    index_links.append(('BIOM table statistics',
                        biom_table_stats_output_fp,
                        _index_headers['run_summary']))
    commands.append([('Generate BIOM table summary',
                      print_biom_table_summary_cmd)])
    
    # filter samples with fewer observations than the requested sampling_depth. 
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered 
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
     (biom_fp,filtered_biom_fp,sampling_depth)
    commands.append([('Filter low sequence count samples from table (minimum sequence count: %d)' % sampling_depth,
                      filter_samples_cmd)])
    biom_fp = filtered_biom_fp
    
    # run initial commands and reset the command list
    command_handler(commands, 
                    status_update_callback, 
                    logger,
                    close_logger_on_success=False)
    commands = []
    
    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth)
        even_dm_fps = run_beta_diversity_through_plots(
         otu_table_fp=biom_fp, 
         mapping_fp=mapping_fp,
         output_dir=bdiv_even_output_dir,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         sampling_depth=sampling_depth,
         # force suppression of distance histograms - boxplots work better
         # in this context, and are created below.
         histogram_categories=[],
         tree_fp=tree_fp,
         parallel=parallel,
         logger=logger,
         suppress_md5=True,
         status_update_callback=status_update_callback)
    
        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % (bdiv_even_output_dir,bdiv_metric)
                try:
                    params_str = get_params_str(params['make_distance_boxplots'])
                except KeyError:
                    params_str = ''
                boxplots_cmd = \
                 'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                 (dm_fp, category, boxplots_output_dir, mapping_fp, params_str)
                commands.append([('Boxplots (%s)' % category,
                                  boxplots_cmd)])
                index_links.append(('Distance boxplots (%s)' % bdiv_metric,
                                    '%s/%s_Distances.pdf' % \
                                     (boxplots_output_dir,category),
                                    _index_headers['beta_diversity_even'] % sampling_depth))
                index_links.append(('Distance boxplots statistics (%s)' % bdiv_metric,
                                    '%s/%s_Stats.txt' % \
                                     (boxplots_output_dir,category),
                                    _index_headers['beta_diversity_even'] % sampling_depth))
            
            index_links.append(('3D plot (%s, continuous coloring)' % bdiv_metric,
                                '%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('3D plot (%s, discrete coloring)' % bdiv_metric,
                                '%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('2D plot (%s, continuous coloring)' % bdiv_metric,
                                '%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('2D plot (%s, discrete coloring)' % bdiv_metric,
                                '%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('Distance matrix (%s)' % bdiv_metric,
                                '%s/%s_dm.txt' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric,
                                '%s/%s_pc.txt' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
    
    if not suppress_alpha_diversity:
        ## Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir,sampling_depth)
        run_alpha_rarefaction(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=arare_full_output_dir,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         tree_fp=tree_fp,
         num_steps=arare_num_steps,
         parallel=parallel,
         logger=logger,
         min_rare_depth=arare_min_rare_depth,
         max_rare_depth=sampling_depth,
         suppress_md5=True,
         status_update_callback=status_update_callback)
    
        index_links.append(('Alpha rarefaction plots',
                            '%s/alpha_rarefaction_plots/rarefaction_plots.html'\
                              % arare_full_output_dir,
                            _index_headers['alpha_diversity']))
                        
        collated_alpha_diversity_fps = \
         glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''
        for category in categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                alpha_comparison_output_fp = '%s/%s_%s.txt' % \
                 (arare_full_output_dir,category,alpha_metric)
                compare_alpha_cmd = \
                 'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                 (collated_alpha_diversity_fp, mapping_fp, category, 
                  alpha_comparison_output_fp, params_str)
                commands.append([('Compare alpha diversity (%s, %s)' %\
                                   (category,alpha_metric),
                                  compare_alpha_cmd)])
                index_links.append(
                 ('Alpha diversity statistics (%s, %s)' % (category,alpha_metric),
                  alpha_comparison_output_fp,
                  _index_headers['alpha_diversity']))
    
    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        run_summarize_taxa_through_plots(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=taxa_plots_output_dir,
         mapping_cat=None, 
         sort=True,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         logger=logger,
         suppress_md5=True,
         status_update_callback=status_update_callback)
    

        index_links.append(('Taxa summary bar plots',
                            '%s/taxa_summary_plots/bar_charts.html'\
                              % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        index_links.append(('Taxa summary area plots',
                            '%s/taxa_summary_plots/area_charts.html'\
                              % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,category)
            run_summarize_taxa_through_plots(
             otu_table_fp=biom_fp,
             mapping_fp=mapping_fp,
             output_dir=taxa_plots_output_dir,
             mapping_cat=category, 
             sort=True,
             command_handler=command_handler,
             params=params,
             qiime_config=qiime_config,
             logger=logger,
             suppress_md5=True,
             status_update_callback=status_update_callback)

            index_links.append(('Taxa summary bar plots',
                                '%s/taxa_summary_plots/bar_charts.html'\
                                  % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] % category))
            index_links.append(('Taxa summary area plots',
                                '%s/taxa_summary_plots/area_charts.html'\
                                  % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] % category))
    
    if not suppress_otu_category_significance:
        # OTU category significance
        for category in categories:
            category_significance_fp = \
             '%s/category_significance_%s.txt' % (output_dir, category)
            try:
                params_str = get_params_str(params['otu_category_significance'])
            except KeyError:
                params_str = ''
            # Build the OTU category significance command
            category_significance_cmd = \
             'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
             (biom_fp, mapping_fp, category,
              category_significance_fp, params_str)
            commands.append([('OTU category significance (%s)' % category,
                              category_significance_cmd)])

            index_links.append(('Category significance (%s)' % category,
                        category_significance_fp,
                        _index_headers['otu_category_sig']))
    
    commands.append([('Compress the filtered BIOM table','gzip %s' % filtered_biom_fp)])
    index_links.append(('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
                        '%s.gz' % filtered_biom_fp,
                        _index_headers['run_summary']))
    
    command_handler(commands, status_update_callback, logger)
    generate_index_page(index_links,index_fp)
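For orientation, a minimal sketch of how this workflow function might be invoked; the filepaths and the 'Treatment' category are placeholders, and load_qiime_config / parse_qiime_parameters are assumed to be importable from qiime.util and qiime.parse as in QIIME 1.x.

from qiime.util import load_qiime_config
from qiime.parse import parse_qiime_parameters

# Placeholder inputs; the workflow creates output_dir and writes
# output_dir/index.html linking to all results.
run_core_diversity_analyses(
    biom_fp='otu_table.biom',
    mapping_fp='map.txt',
    sampling_depth=100,
    output_dir='core_diversity_output',
    qiime_config=load_qiime_config(),
    tree_fp='rep_set.tre',
    params=parse_qiime_parameters([]),
    categories=['Treatment'])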
Example #15
def _color_field_states(map_f, samp_ids, field, field_states, color_by_field):
    """Colors one field by another.

    Returns a list of matplotlib-compatible colors, one for each of the input
    field_states. Also returns a dictionary mapping color_by_field states to
    colors (useful for building a legend, for example).

    If there are not enough colors available (they are drawn from
    qiime.colors.data_colors), an error will be raised as the color mapping
    (and legend) will be ambiguous.

    A one-to-one mapping must exist between each field_state and its
    corresponding color_by field state (otherwise it is unclear which
    corresponding color_by field state should be used to color it by). An error
    will be raised if this one-to-one mapping does not exist.

    Arguments:
        map_f - the mapping file (file-like object)
        samp_ids - a list of sample IDs to consider in the mapping file. Only
            these sample IDs will be used when coloring field states
        field - the field in the mapping file to color
        field_states - the field states in field to color
        color_by_field - the field in the mapping file to color field_states by
    """
    colors = []
    color_pool = [
        matplotlib_rgb_color(data_colors[color].toRGB())
        for color in data_color_order
    ]
    metadata_map = MetadataMap.parseMetadataMap(map_f)

    for field_to_check in field, color_by_field:
        if field_to_check not in metadata_map.CategoryNames:
            raise ValueError("The field '%s' is not in the metadata mapping "
                             "file's column headers." % field_to_check)

    all_field_states = metadata_map.getCategoryValues(samp_ids, field)
    all_color_by_states = metadata_map.getCategoryValues(
        samp_ids, color_by_field)

    if len(set(field_states) - set(all_field_states)) != 0:
        raise ValueError("Encountered unrecognizable field state(s) in %r "
                         "for field '%s'." % (field_states, field))

    # Build mapping from one field to the other.
    field_mapping = defaultdict(list)
    for field_state, color_by_state in zip(all_field_states,
                                           all_color_by_states):
        if field_state in field_states:
            field_mapping[field_state].append(color_by_state)

    # For each of the specified input field states, find its corresponding
    # "color by" field state and give it a color if it hasn't been assigned one
    # yet. Make sure we have enough colors and there is a one-to-one mapping.
    color_mapping = {}
    for field_state in field_states:
        color_by_states = set(field_mapping[field_state])

        if len(color_by_states) != 1:
            raise ValueError("The field '%s' to color by does not have a "
                             "one-to-one mapping with field '%s'. Coloring "
                             "would be ambiguous." % (color_by_field, field))

        color_by_state = list(color_by_states)[0]
        if color_by_state not in color_mapping:
            if len(color_pool) > 0:
                color_mapping[color_by_state] = color_pool.pop(0)
            else:
                raise ValueError("There are not enough available QIIME colors "
                                 "to color each of the field states in field "
                                 "'%s'. Coloring would be ambiguous." %
                                 color_by_field)

        colors.append(color_mapping[color_by_state])

    return colors, color_mapping
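A toy call of _color_field_states, assuming parseMetadataMap accepts any file-like object (StringIO here); 'Site' and 'Region' are made-up columns that satisfy the required one-to-one mapping.

from StringIO import StringIO  # Python 2

map_str = ('#SampleID\tSite\tRegion\n'
           'S1\tA\tNorth\n'
           'S2\tB\tSouth\n')
colors, legend = _color_field_states(StringIO(map_str), ['S1', 'S2'],
                                     'Site', ['A', 'B'], 'Region')
# colors holds one matplotlib-compatible color per field state ('A', 'B');
# legend maps each color_by state ('North', 'South') to its assigned color.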
Example #16
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run; one of 'anosim',
            'permanova', 'bioenv', 'adonis', 'morans_i', 'mrpp', 'permdisp',
            or 'dbrda'
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'bioenv', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'bioenv' or 'morans_i', this parameter will
            be ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """
    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BIO-ENV "
                         "analyses). Please choose a different metadata "
                         "column to perform statistical tests on.")

    dm = DistanceMatrix.read(dm_fp)

    if method in ('anosim', 'permanova', 'bioenv'):
        with open(map_fp, 'U') as map_f:
            md_dict = parse_mapping_file_to_dict(map_f)[0]
        df = pd.DataFrame.from_dict(md_dict, orient='index')

        out_fp = join(out_dir, '%s_results.txt' % method)

        if method in ('anosim', 'permanova'):
            if method == 'anosim':
                method_fn = anosim
            elif method == 'permanova':
                method_fn = permanova

            results = method_fn(dm,
                                df,
                                column=categories[0],
                                permutations=num_perms)
        elif method == 'bioenv':
            results = bioenv(dm, df, columns=categories)

        results.to_csv(out_fp, sep='\t')
    else:
        # Remove any samples from the mapping file that aren't in the distance
        # matrix (important for validation checks). Use strict=True so that an
        # error is raised if the distance matrix contains any samples that
        # aren't in the mapping file.
        with open(map_fp, 'U') as map_f:
            md_map = MetadataMap.parseMetadataMap(map_f)
        md_map.filterSamples(dm.ids, strict=True)

        # These methods are run in R. Input validation must be done here before
        # running the R commands.
        if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
            # Check to make sure all categories passed in are in mapping file
            # and are not all the same value.
            for category in categories:
                if category not in md_map.CategoryNames:
                    raise ValueError("Category '%s' not found in mapping file "
                                     "columns." % category)

                if md_map.hasSingleCategoryValue(category):
                    raise ValueError("All values in category '%s' are the "
                                     "same. The statistical method '%s' "
                                     "cannot operate on a category that "
                                     "creates only a single group of samples "
                                     "(e.g. there are no 'between' distances "
                                     "because there is only a single group)." %
                                     (category, method))

            # Build the command arguments string.
            command_args = [
                '-d %s -m %s -c %s -o %s' %
                (dm_fp, map_fp, categories[0], out_dir)
            ]

            if method == 'morans_i':
                # Moran's I requires only numeric categories.
                for category in categories:
                    if not md_map.isNumericCategory(category):
                        raise TypeError("The category '%s' is not numeric. "
                                        "Not all values could be converted to "
                                        "numbers." % category)
            else:
                # The rest require groups of samples, so the category values
                # cannot all be unique.
                for category in categories:
                    if (md_map.hasUniqueCategoryValues(category)
                            and not (method == 'adonis'
                                     and md_map.isNumericCategory(category))):
                        raise ValueError("All values in category '%s' are "
                                         "unique. This statistical method "
                                         "cannot operate on a category with "
                                         "unique values (e.g. there are no "
                                         "'within' distances because each "
                                         "group of samples contains only a "
                                         "single sample)." % category)

                # Only Moran's I doesn't accept a number of permutations.
                if num_perms < 0:
                    raise ValueError("The number of permutations must be "
                                     "greater than or equal to zero.")

                command_args[0] += ' -n %d' % num_perms

            rex = RExecutor(TmpDir=get_qiime_temp_dir())
            rex(command_args, '%s.r' % method)
        else:
            raise ValueError("Unrecognized method '%s'. Valid methods: %r" %
                             (method, methods))
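A sketch of a typical call, with hypothetical filepaths; per the docstring, out_dir must already exist, and for this choice of method the results are written to stats_results/anosim_results.txt.

compare_categories(dm_fp='unweighted_unifrac_dm.txt',
                   map_fp='map.txt',
                   method='anosim',
                   categories=['Treatment'],
                   num_perms=999,
                   out_dir='stats_results')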
Example #17
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if opts.show_indices:
        for idx_key in sorted(known_indices):
            idx = known_indices[idx_key]
            print "%s: %s, %s" % (idx_key, idx['name'], idx['source'])
            print '\t', 'increased:'
            print '\n'.join(['\t\t%s' % t for t in idx['increased']])
            print '\t', 'decreased:'
            print '\n'.join(['\t\t%s' % t for t in idx['decreased']])
        exit(0)

    if opts.index is not None and known_indices.get(opts.index) is None:
        option_parser.error("%s is not a known index. Known indices are: %s"
                            % (opts.index, ','.join(known_indices.keys())))

    if opts.index is not None and (opts.increased or opts.decreased):
        option_parser.error("Cannot specify both an existing and custom index")

    if opts.index is None and opts.increased is None and \
            opts.decreased is None:
        option_parser.error("Must specify an existing or custom index")

    if opts.increased and opts.decreased is None:
        option_parser.error("Must specify decreased taxa")

    if opts.decreased and opts.increased is None:
        option_parser.error("Must specify increased taxa")

    if opts.index is not None:
        name = opts.name if opts.name is not None else opts.index
        increased = known_indices[opts.index]['increased']
        decreased = known_indices[opts.index]['decreased']
    else:
        name = opts.name if opts.name is not None else 'index'
        increased = set(opts.increased.split(','))
        decreased = set(opts.decreased.split(','))

    if opts.input is None:
        option_parser.error("Input not specified")

    if opts.output is None:
        option_parser.error("Output not specified")

    table = load_table(opts.input)

    if opts.mapping_file:
        mapping_file = open(opts.mapping_file, 'U')
        output_fp = TemporaryFile()
    else:
        mapping_file = None
        output_fp = open(opts.output, 'w')

    output_fp.write("#SampleID\t%s\n" % name)
    for id_, value in compute_index(table, increased, decreased, opts.key):
        output_fp.write("%s\t%f\n" % (id_, value))

    if opts.mapping_file:
        output_fp.seek(0)
        mapping_data = MetadataMap.mergeMappingFiles([output_fp, mapping_file],
                                                     no_data_value=nan)
        with open(opts.output, 'w') as fp:
            fp.write(str(mapping_data))

    output_fp.close()
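Whatever name this script is installed under (not shown here), its plain output is a two-column, tab-separated file; an illustrative result for an index named 'index' might look like the following (values hypothetical). When a mapping file is supplied, the new column is instead merged into that mapping file via MetadataMap.mergeMappingFiles and written to the output path.

#SampleID	index
S1	0.412000
S2	-1.250000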
Example #18
File: util.py Project: teravest/emperor
def preprocess_mapping_file(data,
                            headers,
                            columns,
                            unique=False,
                            single=False,
                            clones=0):
    """Process a mapping file to expand the data or remove unuseful fields

    Inputs:
    data: mapping file data
    headers: mapping file headers
    columns: list of headers to keep, if one of these headers includes two
    ampersands, this function will create a new column by merging the delimited
    columns.
    unique: keep columns where all values are unique
    single: keep columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers
    """

    # The sample ID must always be there, else it's meaningless data
    if 'SampleID' != columns[0]:
        columns = ['SampleID'] + columns

    # process concatenated columns if needed
    merge = []
    for column in columns:
        if '&&' in column:
            merge.append(column)
    # each element needs several columns to be merged
    for new_column in merge:
        indices = [
            headers.index(header_name)
            for header_name in new_column.split('&&')
        ]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append(''.join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or singled valued columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [
                column_name for column_name in headers[1::]
                if metadata.hasUniqueCategoryValues(column_name)
            ]

        # remove categories where there is only one value
        if single:
            columns_to_remove += [
                column_name for column_name in headers[1::]
                if metadata.hasSingleCategoryValue(column_name)
            ]
        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data,
                                                       headers,
                                                       columns_to_remove,
                                                       negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones mean: replicate the metadata retagging the sample ids with a suffix
    if clones:
        out_data = []
        for index in range(0, clones):
            out_data.extend([[element[0] + '_%d' % index] + element[1::]
                             for element in data])
        data = out_data

    return data, headers
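A small sketch of the '&&' merging behavior with toy data; the exact header cleanup depends on sanitize_mapping_file, so the merged header shown is approximate.

data = [['S1', 'A', 'North'],
        ['S2', 'B', 'South']]
headers = ['SampleID', 'Site', 'Region']
# Keep SampleID plus a new column that concatenates Site and Region:
data, headers = preprocess_mapping_file(data, headers,
                                        ['SampleID', 'Site&&Region'])
# Each row gains the concatenated value (e.g. 'ANorth' for S1) and only
# the requested columns survive.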
Example #19
File: filter.py Project: qinjunjie/qiime
def sample_ids_from_category_state_coverage(
    mapping_f, coverage_category, subject_category, min_num_states=None, required_states=None, considered_states=None
):
    """Filter sample IDs based on subject's coverage of a category.

    Given a category that groups samples by subject (subject_category), samples
    are filtered by how well a subject covers (i.e. has at least one sample
    for) the category states in coverage_category.

    Two filtering criteria are provided (min_num_states and required_states). At
    least one must be provided. If both are provided, the subject must meet
    both criteria to pass the filter (i.e. providing both filters is an AND,
    not an OR, operation).

    A common use case is to provide a 'time' category for coverage_category and
    an 'individual' category for subject_category in order to filter out
    individuals from a study that do not have samples for some minimum number
    of timepoints (min_num_states) and that do not have samples for certain
    timepoints (required_states). For example, this could be the first and last
    timepoints in the study.

    Returns a list of sample IDs to keep, the number of subjects that were
    kept, and the number of unique category states in coverage_category that
    were kept. The list of sample IDs is not guaranteed to be in any specific
    order relative to the order of sample IDs or subjects in the mapping file.

    Arguments:
        mapping_f - metadata mapping file (file-like object)
        coverage_category - category to test subjects' coverage (string)
        subject_category - category to group samples by subject (string)
        min_num_states - minimum number of category states in coverage_category
            that a subject must cover (i.e. have at least one sample for) to be
            included in results (integer)
        required_states - category states in coverage_category that must be
            covered by a subject's samples in order to be included in results
            (list of strings)
        considered_states - category states that are counted toward the 
            min_num_states (list of strings)
    """
    metadata_map = MetadataMap.parseMetadataMap(mapping_f)

    # Make sure our input looks sane.
    if coverage_category == "SampleID" or subject_category == "SampleID":
        raise ValueError(
            "The 'SampleID' category is not suitable for use in "
            "this function. Please choose a different category "
            "from the metadata mapping file."
        )

    if coverage_category not in metadata_map.CategoryNames:
        raise ValueError("The coverage category '%s' is not in the metadata " "mapping file." % coverage_category)

    if subject_category not in metadata_map.CategoryNames:
        raise ValueError("The subject category '%s' is not in the metadata " "mapping file." % subject_category)

    if required_states is not None:
        # required_states must be in coverage_category's states in the mapping
        # file.
        required_states = set(map(str, required_states))
        valid_coverage_states = set(metadata_map.getCategoryValues(metadata_map.SampleIds, coverage_category))
        invalid_coverage_states = required_states - valid_coverage_states

        if invalid_coverage_states:
            raise ValueError(
                "The category state(s) '%s' are not in the '%s' "
                "category in the metadata mapping file." % (", ".join(invalid_coverage_states), coverage_category)
            )

    if considered_states is not None:
        # considered_states is not as restrictive as required_states - we don't
        # require that these are present, so it's OK if some of the states
        # listed here don't actually show up in the mapping file (allowing
        # the user to pass something like range(100) to consider only states
        # that fall in some range)
        considered_states = set(map(str, considered_states))
        # define a function to determine if a state should be considered
        consider_state = lambda s: s in considered_states
    else:
        # define a dummy function to consider all states (the default
        # if the user does not provide a list of considered_states)
        consider_state = lambda s: True

    if min_num_states is None and required_states is None:
        raise ValueError(
            "You must specify either the minimum number of "
            "category states the subject must have samples for "
            "(min_num_states), or the minimal category states "
            "the subject must have samples for (required_states), "
            "or both. Supplying neither filtering criteria is "
            "not supported."
        )

    # Build mapping from subject to sample IDs.
    subjects = defaultdict(list)
    for samp_id in metadata_map.SampleIds:
        subject = metadata_map.getCategoryValue(samp_id, subject_category)
        subjects[subject].append(samp_id)

    # Perform filtering.
    samp_ids_to_keep = []
    num_subjects_kept = 0
    states_kept = []
    for subject, samp_ids in subjects.items():
        subject_covered_states = set(metadata_map.getCategoryValues(samp_ids, coverage_category))

        # Short-circuit evaluation of ANDing filters.
        keep_subject = True
        if min_num_states is not None:
            # note: when summing a list of boolean values, True == 1 and False == 0
            if sum([consider_state(s) for s in subject_covered_states]) < min_num_states:
                keep_subject = False
        if keep_subject and required_states is not None:
            if len(subject_covered_states & required_states) != len(required_states):
                keep_subject = False

        if keep_subject:
            samp_ids_to_keep.extend(samp_ids)
            states_kept.extend(subject_covered_states)
            num_subjects_kept += 1

    return samp_ids_to_keep, num_subjects_kept, len(set(states_kept))
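A toy run, assuming parseMetadataMap accepts a StringIO object: subject I1 covers both timepoints while I2 covers only one, so only I1's samples pass a min_num_states=2 filter.

from StringIO import StringIO  # Python 2

map_str = ('#SampleID\tIndividual\tTime\n'
           'S1\tI1\t1\n'
           'S2\tI1\t2\n'
           'S3\tI2\t1\n')
samp_ids, n_subjects, n_states = sample_ids_from_category_state_coverage(
    StringIO(map_str), coverage_category='Time',
    subject_category='Individual', min_num_states=2)
# samp_ids -> ['S1', 'S2'] (order not guaranteed); n_subjects -> 1;
# n_states -> 2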
Example #20
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the mapping file and distance matrix.
    md_map = MetadataMap.parseMetadataMap(open(opts.mapping_file,'U'))
    dm = DistanceMatrix.parseDistanceMatrix(open(opts.input_dm,'U'))

    # Separate all categories into a list, then grab the first category.
    categories = opts.categories.split(',')

    # Cursory check to make sure all categories passed in are in mapping file.
    maps = parse_mapping_file(open(opts.mapping_file,'U').readlines())
    for category in categories:
        if category not in maps[1][1:]:
            option_parser.error("Category '%s' not found in mapping file "
                                "columns." % category)

    # Make sure the input distance matrix is symmetric and hollow. Must check
    # here before allowing R to use it, as R will silently ignore the diagonal
    # and upper triangle of the distance matrix.
    if not dm.is_symmetric_and_hollow():
        option_parser.error("The distance matrix must be symmetric and "
                            "hollow.")

    # Figure out which method we need to run.
    if opts.method == 'adonis':
        command_args = ["-d " + opts.input_dm + " -m " + opts.mapping_file + \
            " -c " + categories[0] + " -o " + opts.output_dir + " -n " + \
            str(opts.num_permutations)]
        rex = RExecutor()
        rex(command_args, "adonis.r", output_dir=opts.output_dir)
    elif opts.method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(opts.num_permutations)

        output_file = open(opts.output_dir + "/" + opts.method + \
            "_results.txt", "w+")
        output_file.write("Method Name\tR-value\tP-value")
        output_file.write("\n")
        output_file.write(anosim_results["method_name"]+"\t"+\
            str(anosim_results["r_value"])+"\t"+\
            str(anosim_results["p_value"])+"\t")
        output_file.write("\n")
        output_file.close()
    elif opts.method == 'best':
        bioenv = BioEnv(dm, md_map, categories)
        bioenv_results = bioenv()

        output_file = open(opts.output_dir+"/best_results.txt", 'w+')
        output_file.write("Method Name:\tNum_Vars:\t")
        output_file.write("\n")
        output_file.write(bioenv_results["method_name"]+"\t"+\
            str(bioenv_results["num_vars"]) + "\t")
        output_file.write("\n")
        output_file.write("Variables:\t")
        output_file.write("\n")
        for variable in bioenv_results["vars"]:
            output_file.write(str(variable) + "\t")
        output_file.write("\n")
        output_file.write("RHO_Values:\t")
        output_file.write("\n")
        for rho_val in bioenv_results["bioenv_rho_vals"]:
            output_file.write(str(rho_val) + "\t")
        output_file.write("\n")
        output_file.close()
    elif opts.method == 'morans_i':
        command_args = ["-i " + opts.input_dm + " -m " + opts.mapping_file + \
            " -c " + categories[0] + " -o " + opts.output_dir]
        rex = RExecutor()
        rex(command_args, "morans_i.r", output_dir=opts.output_dir)
    elif opts.method == 'mrpp':
        command_args = ["-d " + opts.input_dm + " -m " + opts.mapping_file + \
            " -c " + categories[0] + " -o " + opts.output_dir + \
            " -n " + str(opts.num_permutations)]
        rex = RExecutor()
        rex(command_args, "mrpp.r", output_dir=opts.output_dir)
    elif opts.method == 'permanova':
        permanova_plain = Permanova(md_map, dm, categories[0])
        permanova_results = permanova_plain(opts.num_permutations)

        output_file = open(opts.output_dir+"/permanova_results.txt", 'w+')
        output_file.write("Method Name\tF-value\tP-value")
        output_file.write("\n")
        output_file.write(permanova_results["method_name"]+"\t"+\
            str(permanova_results["f_value"]) + "\t" + \
            format_p_value_for_num_iters(permanova_results["p_value"], \
            opts.num_permutations)+"\t")
        output_file.write("\n")
        output_file.close()
    elif opts.method == 'permdisp':
        command_args = ["-d " + opts.input_dm + " -m " + opts.mapping_file + \
            " -c " + categories[0] + " -o " + opts.output_dir + " -n " + \
            str(opts.num_permutations)]
        rex = RExecutor()
        rex(command_args, "permdisp.r", output_dir=opts.output_dir)
    elif opts.method == 'dbrda':
        command_args = ["-i " + opts.input_dm + " -m " + opts.mapping_file + \
            " -c " + categories[0] + " -o " + opts.output_dir + " -n " + \
            str(opts.num_permutations)]
        rex = RExecutor()
        rex(command_args, "dbrda.r", output_dir=opts.output_dir)
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_otu_category_significance=False,
    status_update_callback=print_to_stdout,
):
    """
    """
    if categories is not None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = parse_mapping_file_to_dict(open(mapping_fp, "U"))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError, (
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)" % (c, ", ".join(metadata_map.CategoryNames))
                )
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError, (
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values." % c
                )

    else:
        categories = []

    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = "%s/index.html" % output_dir
    index_links = []
    commands = []

    # begin logging
    log_fp = generate_log_fp(output_dir)
    index_links.append(("Master run log", log_fp, _index_headers["run_summary"]))
    logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run print_biom_table_summary.py on input BIOM table
    try:
        params_str = get_params_str(params["print_biom_table_summary"])
    except KeyError:
        params_str = ""
    biom_table_stats_output_fp = "%s/biom_table_summary.txt" % output_dir
    print_biom_table_summary_cmd = "print_biom_table_summary.py -i %s -o %s --suppress_md5 %s" % (
        biom_fp,
        biom_table_stats_output_fp,
        params_str,
    )
    index_links.append(("BIOM table statistics", biom_table_stats_output_fp, _index_headers["run_summary"]))
    commands.append([("Generate BIOM table summary", print_biom_table_summary_cmd)])

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (e.g., beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" % (
        biom_fp,
        filtered_biom_fp,
        sampling_depth,
    )
    commands.append(
        [
            (
                "Filter low sequence count samples from table (minimum sequence count: %d)" % sampling_depth,
                filter_samples_cmd,
            )
        ]
    )
    biom_fp = filtered_biom_fp

    # run initial commands and reset the command list
    command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
    commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = "%s/bdiv_even%d/" % (output_dir, sampling_depth)
        even_dm_fps = run_beta_diversity_through_plots(
            otu_table_fp=biom_fp,
            mapping_fp=mapping_fp,
            output_dir=bdiv_even_output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            sampling_depth=sampling_depth,
            # force suppression of distance histograms - boxplots work better
            # in this context, and are created below.
            histogram_categories=[],
            tree_fp=tree_fp,
            parallel=parallel,
            logger=logger,
            suppress_md5=True,
            status_update_callback=status_update_callback,
        )

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = "%s/%s_boxplots/" % (bdiv_even_output_dir, bdiv_metric)
                try:
                    params_str = get_params_str(params["make_distance_boxplots"])
                except KeyError:
                    params_str = ""
                boxplots_cmd = "make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s" % (
                    dm_fp,
                    category,
                    boxplots_output_dir,
                    mapping_fp,
                    params_str,
                )
                commands.append([("Boxplots (%s)" % category, boxplots_cmd)])
                index_links.append(
                    (
                        "Distance boxplots (%s)" % bdiv_metric,
                        "%s/%s_Distances.pdf" % (boxplots_output_dir, category),
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )
                index_links.append(
                    (
                        "Distance boxplots statistics (%s)" % bdiv_metric,
                        "%s/%s_Stats.txt" % (boxplots_output_dir, category),
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )

            index_links.append(
                (
                    "3D plot (%s, continuous coloring)" % bdiv_metric,
                    "%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html" % (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "3D plot (%s, discrete coloring)" % bdiv_metric,
                    "%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html" % (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "2D plot (%s, continuous coloring)" % bdiv_metric,
                    "%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html" % (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "2D plot (%s, discrete coloring)" % bdiv_metric,
                    "%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html" % (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Distance matrix (%s)" % bdiv_metric,
                    "%s/%s_dm.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Principal coordinate matrix (%s)" % bdiv_metric,
                    "%s/%s_pc.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )

    if not suppress_alpha_diversity:
        ## Alpha rarefaction workflow
        arare_full_output_dir = "%s/arare_max%d/" % (output_dir, sampling_depth)
        run_alpha_rarefaction(
            otu_table_fp=biom_fp,
            mapping_fp=mapping_fp,
            output_dir=arare_full_output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            tree_fp=tree_fp,
            num_steps=arare_num_steps,
            parallel=parallel,
            logger=logger,
            min_rare_depth=arare_min_rare_depth,
            max_rare_depth=sampling_depth,
            suppress_md5=True,
            status_update_callback=status_update_callback,
        )

        index_links.append(
            (
                "Alpha rarefaction plots",
                "%s/alpha_rarefaction_plots/rarefaction_plots.html" % arare_full_output_dir,
                _index_headers["alpha_diversity"],
            )
        )

        collated_alpha_diversity_fps = glob("%s/alpha_div_collated/*txt" % arare_full_output_dir)
        try:
            params_str = get_params_str(params["compare_alpha_diversity"])
        except KeyError:
            params_str = ""
        for category in categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                alpha_comparison_output_fp = "%s/%s_%s.txt" % (arare_full_output_dir, category, alpha_metric)
                compare_alpha_cmd = "compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s" % (
                    collated_alpha_diversity_fp,
                    mapping_fp,
                    category,
                    alpha_comparison_output_fp,
                    params_str,
                )
                commands.append([("Compare alpha diversity (%s, %s)" % (category, alpha_metric), compare_alpha_cmd)])
                index_links.append(
                    (
                        "Alpha diversity statistics (%s, %s)" % (category, alpha_metric),
                        alpha_comparison_output_fp,
                        _index_headers["alpha_diversity"],
                    )
                )

    if not suppress_taxa_summary:
        taxa_plots_output_dir = "%s/taxa_plots/" % output_dir
        run_summarize_taxa_through_plots(
            otu_table_fp=biom_fp,
            mapping_fp=mapping_fp,
            output_dir=taxa_plots_output_dir,
            mapping_cat=None,
            sort=True,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            logger=logger,
            suppress_md5=True,
            status_update_callback=status_update_callback,
        )

        index_links.append(
            (
                "Taxa summary bar plots",
                "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        index_links.append(
            (
                "Taxa summary area plots",
                "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        for category in categories:
            taxa_plots_output_dir = "%s/taxa_plots_%s/" % (output_dir, category)
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=category,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback,
            )

            index_links.append(
                (
                    "Taxa summary bar plots",
                    "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )
            index_links.append(
                (
                    "Taxa summary area plots",
                    "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )

    if not suppress_otu_category_significance:
        # OTU category significance
        for category in categories:
            category_significance_fp = "%s/category_significance_%s.txt" % (output_dir, category)
            try:
                params_str = get_params_str(params["otu_category_significance"])
            except KeyError:
                params_str = ""
            # Build the OTU category significance command
            category_significance_cmd = "otu_category_significance.py -i %s -m %s -c %s -o %s %s" % (
                biom_fp,
                mapping_fp,
                category,
                category_significance_fp,
                params_str,
            )
            commands.append([("OTU category significance (%s)" % category, category_significance_cmd)])

            index_links.append(
                ("Category significance (%s)" % category, category_significance_fp, _index_headers["otu_category_sig"])
            )

    commands.append([("Compress the filtered BIOM table", "gzip %s" % filtered_biom_fp)])
    index_links.append(
        (
            "Filtered BIOM table (minimum sequence count: %d)" % sampling_depth,
            "%s.gz" % filtered_biom_fp,
            _index_headers["run_summary"],
        )
    )

    command_handler(commands, status_update_callback, logger)
    generate_index_page(index_links, index_fp)
Example #22
def preprocess_mapping_file(data, headers, columns, unique=False, single=False, clones=0):
    """Process a mapping file to expand the data or remove unuseful fields

    Inputs:
    data: mapping file data
    headers: mapping file headers
    columns: list of headers to keep, if one of these headers includes two
    ampersands, this function will create a new column by merging the delimited
    columns.
    unique: keep columns where all values are unique
    single: keep columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers
    """

    # The sample ID must always be there, else it's meaningless data
    if "SampleID" != columns[0]:
        columns = ["SampleID"] + columns

    # process concatenated columns if needed
    merge = []
    for column in columns:
        if "&&" in column:
            merge.append(column)
    # each element needs several columns to be merged
    for new_column in merge:
        indices = [headers.index(header_name) for header_name in new_column.split("&&")]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append("".join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or singled valued columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [
                column_name for column_name in headers[1::] if metadata.hasUniqueCategoryValues(column_name)
            ]

        # remove categories where there is only one value
        if single:
            columns_to_remove += [
                column_name for column_name in headers[1::] if metadata.hasSingleCategoryValue(column_name)
            ]
        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data, headers, columns_to_remove, negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones mean: replicate the metadata retagging the sample ids with a suffix
    if clones:
        out_data = []
        for index in range(0, clones):
            out_data.extend([[element[0] + "_%d" % index] + element[1::] for element in data])
        data = out_data

    return data, headers
Example #23
            if '&&' in col:
                for _col in col.split('&&'):
                    if _col not in lookup_header:
                        offending_fields.append(col)
            elif col not in lookup_header:
                offending_fields.append(col)
    else:
        # if the user didn't specify the header names display everything
        color_by_column_names = header[:]

    # extract a list of the custom axes provided and each element is numeric
    if custom_axes:
        custom_axes = custom_axes.strip().strip("'").strip('"').split(',')

        # the MetadataMap object makes some checks easier
        map_object = MetadataMap(mapping_file_to_dict(mapping_data, header), [])
        for axis in custom_axes:
            # append the field to the error queue that it belongs to
            if axis not in lookup_header:
                offending_fields.append(axis)
                break
            # make sure this value is in the mapping file
            elif axis not in color_by_column_names:
                color_by_column_names.append(axis)
        # perform only if the for loop does not call break
        else:
            # make sure all these axes are numeric
            for axis in custom_axes:
                if not map_object.isNumericCategory(axis):
                    non_numeric_categories.append(axis)
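The numeric check above can be exercised in isolation; a minimal sketch with toy data, assuming mapping_file_to_dict and MetadataMap are imported as in this module.

mapping_data = [['S1', '10.5'], ['S2', '42']]
header = ['SampleID', 'Elevation']
map_object = MetadataMap(mapping_file_to_dict(mapping_data, header), [])
assert map_object.isNumericCategory('Elevation')  # every value parses as a number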
Example #24
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    status_update_callback=print_to_stdout):
    """
    """

    if categories is not None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = \
         parse_mapping_file_to_dict(open(mapping_fp,'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError, ("Category '%s' is not a column header "
                 "in your mapping file. "
                 "Categories are case and white space sensitive. Valid "
                 "choices are: (%s)" % (c,', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError, ("Category '%s' contains only one value. "
                 "Categories analyzed here require at least two values." % c)
            
    else:
        categories = []
    
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])
        
    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    
    # begin logging
    log_fp = generate_log_fp(output_dir)
    index_links.append(('Master run log',log_fp,'Log files'))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp,mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger,input_fps)
    
    
    bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth)
    even_dm_fps = run_beta_diversity_through_plots(
     otu_table_fp=biom_fp, 
     mapping_fp=mapping_fp,
     output_dir=bdiv_even_output_dir,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     sampling_depth=sampling_depth,
     # force suppression of distance histograms - boxplots work better
     # in this context, and are created below.
     histogram_categories=[],
     tree_fp=tree_fp,
     parallel=parallel,
     logger=logger,
     status_update_callback=status_update_callback)
    
    for bdiv_metric, dm_fp in even_dm_fps:
        for category in categories:
            boxplots_output_dir = '%s/%s_boxplots/' % (bdiv_even_output_dir,bdiv_metric)
            try:
                params_str = get_params_str(params['make_distance_boxplots'])
            except KeyError:
                params_str = ''
            boxplots_cmd = \
             'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
             (dm_fp, category, boxplots_output_dir, mapping_fp, params_str)
            commands.append([('Boxplots (%s)' % category,
                              boxplots_cmd)])
            index_links.append(('Distance boxplots (%s)' % bdiv_metric,
                                '%s/%s_Distances.pdf' % \
                                 (boxplots_output_dir,category),
                                'Beta diversity results (even sampling: %d)' % sampling_depth))
            index_links.append(('Distance boxplots statistics (%s)' % bdiv_metric,
                                '%s/%s_Stats.txt' % \
                                 (boxplots_output_dir,category),
                                'Beta diversity results (even sampling: %d)' % sampling_depth))
            
        index_links.append(('3D plot (%s, continuous coloring)' % bdiv_metric,
                            '%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('3D plot (%s, discrete coloring)' % bdiv_metric,
                            '%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('2D plot (%s, continuous coloring)' % bdiv_metric,
                            '%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('2D plot (%s, discrete coloring)' % bdiv_metric,
                            '%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('Distance matrix (%s)' % bdiv_metric,
                            '%s/%s_dm.txt' % \
                             (bdiv_even_output_dir,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric,
                            '%s/%s_pc.txt' % \
                             (bdiv_even_output_dir,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        
    ## Alpha rarefaction workflow
    arare_full_output_dir = '%s/arare_max%d/' % (output_dir,sampling_depth)
    run_qiime_alpha_rarefaction(
     otu_table_fp=biom_fp,
     mapping_fp=mapping_fp,
     output_dir=arare_full_output_dir,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     tree_fp=tree_fp,
     num_steps=arare_num_steps,
     parallel=parallel,
     logger=logger,
     min_rare_depth=arare_min_rare_depth,
     max_rare_depth=sampling_depth,
     status_update_callback=status_update_callback)
    
    index_links.append(('Alpha rarefaction plots',
                        '%s/alpha_rarefaction_plots/rarefaction_plots.html'\
                          % arare_full_output_dir,
                        "Alpha rarefaction results"))
                        
    collated_alpha_diversity_fps = \
     glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
    try:
        params_str = get_params_str(params['compare_alpha_diversity'])
    except KeyError:
        params_str = ''
    for c in categories:
        for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
            alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
            alpha_comparison_output_fp = '%s/%s_%s.txt' % \
             (arare_full_output_dir,c,alpha_metric)
            compare_alpha_cmd = \
             'compare_alpha_diversity.py -i %s -m %s -c %s -d %s -o %s -n 999 %s' %\
             (collated_alpha_diversity_fp, mapping_fp, c, 
              sampling_depth, alpha_comparison_output_fp, params_str)
            commands.append([('Compare alpha diversity (%s, %s)' %\
                               (c,alpha_metric),
                              compare_alpha_cmd)])
            index_links.append(
             ('Alpha diversity statistics (%s, %s)' % (c,alpha_metric),
              alpha_comparison_output_fp,
              "Alpha rarefaction results"))
    
    taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
    run_summarize_taxa_through_plots(
     otu_table_fp=biom_fp,
     mapping_fp=mapping_fp,
     output_dir=taxa_plots_output_dir,
     mapping_cat=None, 
     sort=True,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     logger=logger, 
     status_update_callback=status_update_callback)
    

    index_links.append(('Taxa summary bar plots',
                        '%s/taxa_summary_plots/bar_charts.html'\
                          % taxa_plots_output_dir,
                        "Taxonomic summary results"))
    index_links.append(('Taxa summary area plots',
                        '%s/taxa_summary_plots/area_charts.html'\
                          % taxa_plots_output_dir,
                        "Taxonomic summary results"))
    for c in categories:
        taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,c)
        run_summarize_taxa_through_plots(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=taxa_plots_output_dir,
         mapping_cat=c, 
         sort=True,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         logger=logger, 
         status_update_callback=status_update_callback)

        index_links.append(('Taxa summary bar plots',
                            '%s/taxa_summary_plots/bar_charts.html'\
                              % taxa_plots_output_dir,
                            "Taxonomic summary results (by %s)" % c))
        index_links.append(('Taxa summary area plots',
                            '%s/taxa_summary_plots/area_charts.html'\
                              % taxa_plots_output_dir,
                            "Taxonomic summary results (by %s)" % c))
    
    # OTU category significance
    for category in categories:
        category_significance_fp = \
         '%s/category_significance_%s.txt' % (output_dir, category)
        try:
            params_str = get_params_str(params['otu_category_significance'])
        except KeyError:
            params_str = ''
        # Build the OTU category significance command
        category_significance_cmd = \
         'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
         (biom_fp, mapping_fp, category,
          category_significance_fp, params_str)
        commands.append([('OTU category significance (%s)' % category,
                          category_significance_cmd)])

        index_links.append(('Category significance (%s)' % category,
                    category_significance_fp,
                    "Category results"))
    
    command_handler(commands, status_update_callback, logger)
    generate_index_page(index_links,index_fp)
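# The `commands` list built above is a list of sublists of
# (description, shell command) tuples that are handed to `command_handler`.
# A minimal sketch of such a handler, modeled on (but not copied from)
# QIIME's call_commands_serially; the error handling here is an assumption:
from subprocess import Popen, PIPE

def call_commands_serially_sketch(commands, status_update_callback, logger,
                                  close_logger_on_success=True):
    for command_group in commands:
        for description, cmd in command_group:
            status_update_callback('%s: %s' % (description, cmd))
            proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
            stdout, stderr = proc.communicate()
            logger.write('Command: %s\nStdout:\n%s\nStderr:\n%s\n' %
                         (cmd, stdout, stderr))
            if proc.returncode != 0:
                logger.close()
                raise RuntimeError('Command failed: %s' % cmd)
    if close_logger_on_success:
        logger.close()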
示例#25
0
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if opts.show_indices:
        for idx_key in sorted(known_indices):
            idx = known_indices[idx_key]
            print "%s: %s, %s" % (idx_key, idx['name'], idx['source'])
            print '\t', 'increased:'
            print '\n'.join(['\t\t%s' % t for t in idx['increased']])
            print '\t', 'decreased:'
            print '\n'.join(['\t\t%s' % t for t in idx['decreased']])
        exit(0)

    if opts.index is not None and known_indices.get(opts.index) is None:
        option_parser.error("%s is not a known index. Known indices are: %s" %
                            (opts.index, ','.join(known_indices.keys())))

    if opts.index is not None and (opts.increased or opts.decreased):
        option_parser.error("Cannot specify both an existing and custom index")

    if opts.index is None and opts.increased is None and \
            opts.decreased is None:
        option_parser.error("Must specify an existing or custom index")

    if opts.increased and opts.decreased is None:
        option_parser.error("Must specify decreased taxa")

    if opts.decreased and opts.increased is None:
        option_parser.error("Must specify increased taxa")

    if opts.index is not None:
        name = opts.name if opts.name is not None else opts.index
        increased = known_indices[opts.index]['increased']
        decreased = known_indices[opts.index]['decreased']
    else:
        name = opts.name if opts.name is not None else 'index'
        increased = set(opts.increased.split(','))
        decreased = set(opts.decreased.split(','))

    if opts.input is None:
        option_parser.error("Input not specified")

    if opts.output is None:
        option_parser.error("Output not specified")

    table = load_table(opts.input)

    if opts.mapping_file:
        mapping_file = open(opts.mapping_file, 'U')
        output_file = TemporaryFile()
    else:
        mapping_file = None
        output_file = open(opts.output, 'w')

    output_file.write("#SampleID\t%s\n" % name)
    for id_, value in compute_index(table, increased, decreased, opts.key):
        output_file.write("%s\t%f\n" % (id_, value))

    if opts.mapping_file:
        output_file.seek(0)
        mapping_data = MetadataMap.mergeMappingFiles(
            [output_file, mapping_file], no_data_value=nan)
        with open(opts.output, 'w') as f:
            f.write(str(mapping_data))

    output_file.close()
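# compute_index() is not defined in this excerpt. Below is a plausible
# sketch inferred only from the call site above: it must yield
# (sample_id, value) pairs given a BIOM table, sets of 'increased' and
# 'decreased' taxa, and an observation-metadata key. The log-ratio scoring
# and the pseudocount are assumptions, not taken from the original source.
from math import log

def compute_index_sketch(table, increased, decreased, key):
    # assumes every observation carries metadata under `key` (e.g. taxonomy)
    obs_md = table.metadata(axis='observation')
    inc_idx = [i for i, md in enumerate(obs_md) if set(md[key]) & increased]
    dec_idx = [i for i, md in enumerate(obs_md) if set(md[key]) & decreased]
    for values, sample_id, _ in table.iter(axis='sample', dense=True):
        inc_sum = sum(values[i] for i in inc_idx) + 1.0  # pseudocount
        dec_sum = sum(values[i] for i in dec_idx) + 1.0
        yield sample_id, log(inc_sum / dec_sum)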
示例#26
0
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_otu_category_significance=False,
    status_update_callback=print_to_stdout):
    """
    """
    if categories != None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = \
         parse_mapping_file_to_dict(open(mapping_fp,'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError, ("Category '%s' is not a column header "
                 "in your mapping file. "
                 "Categories are case and white space sensitive. Valid "
                 "choices are: (%s)" % (c,', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError, ("Category '%s' contains only one value. "
                 "Categories analyzed here require at least two values." % c)
            
    else:
        categories= []
    
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])
        
    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []
    
    # begin logging
    old_log_fps = glob(join(output_dir,'log_20*txt'))
    log_fp = generate_log_fp(output_dir)
    index_links.append(('Master run log',log_fp,_index_headers['run_summary']))
    for old_log_fp in old_log_fps:
        index_links.append(('Previous run log',old_log_fp,_index_headers['run_summary']))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp,mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger,input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params['biom-summarize-table'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = \
         "biom summarize-table -i %s -o %s --suppress-md5 %s" % \
         (biom_fp, biom_table_stats_output_fp,params_str)
        commands.append([('Generate BIOM table summary',
                          biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" \
                     % biom_table_stats_output_fp)
    index_links.append(('BIOM table statistics',
                        biom_table_stats_output_fp,
                        _index_headers['run_summary']))
    
    # filter samples with fewer observations than the requested sampling_depth. 
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered 
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
         (biom_fp,filtered_biom_fp,sampling_depth)
        commands.append([('Filter low sequence count samples from table (minimum sequence count: %d)' % sampling_depth,
                          filter_samples_cmd)])
    else:
        logger.write("Skipping filter_samples_from_otu_table.py as %s exists.\n\n" \
                     % filtered_biom_fp)
    biom_fp = filtered_biom_fp
    
    # run initial commands and reset the command list
    if len(commands) > 0:
        command_handler(commands, 
                        status_update_callback, 
                        logger,
                        close_logger_on_success=False)
        commands = []
    
    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth)
        # Need to check for the existence of any distance matrices, since the user 
        # can select which will be generated.
        existing_dm_fps = glob('%s/*_dm.txt' % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
             otu_table_fp=biom_fp, 
             mapping_fp=mapping_fp,
             output_dir=bdiv_even_output_dir,
             command_handler=command_handler,
             params=params,
             qiime_config=qiime_config,
             sampling_depth=sampling_depth,
             tree_fp=tree_fp,
             parallel=parallel,
             logger=logger,
             suppress_md5=True,
             status_update_callback=status_update_callback)
        else:
            logger.write("Skipping beta_diversity_through_plots.py as %s exist(s).\n\n" \
                         % ', '.join(existing_dm_fps))
            # note: use slicing rather than str.strip, which removes
            # characters (not a suffix) and can mangle some metric names
            even_dm_fps = [(split(fp)[1][:-len('_dm.txt')],fp) for fp in existing_dm_fps]
        
        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params['make_distance_boxplots'])
        except KeyError:
            params_str = ''
        
        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % (bdiv_even_output_dir,bdiv_metric)
                plot_output_fp = '%s/%s_Distances.pdf' % (boxplots_output_dir,category)
                stats_output_fp = '%s/%s_Stats.txt' % (boxplots_output_dir,category)
                if not exists(plot_output_fp):
                    boxplots_cmd = \
                     'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                     (dm_fp, category, boxplots_output_dir, mapping_fp, params_str)
                    commands.append([('Boxplots (%s)' % category,
                                      boxplots_cmd)])
                else:
                    logger.write("Skipping make_distance_boxplots.py for %s as %s exists.\n\n" \
                                 % (category, plot_output_fp))
                index_links.append(('Distance boxplots (%s)' % bdiv_metric,
                                    plot_output_fp,
                                    _index_headers['beta_diversity_even'] % sampling_depth))
                index_links.append(('Distance boxplots statistics (%s)' % bdiv_metric,
                                    stats_output_fp,
                                    _index_headers['beta_diversity_even'] % sampling_depth))
            
            index_links.append(('PCoA plot (%s)' % bdiv_metric,
                                '%s/%s_emperor_pcoa_plot/index.html' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('Distance matrix (%s)' % bdiv_metric,
                                '%s/%s_dm.txt' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric,
                                '%s/%s_pc.txt' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
    
    if not suppress_alpha_diversity:
        ## Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir,sampling_depth)
        rarefaction_plots_output_fp = \
         '%s/alpha_rarefaction_plots/rarefaction_plots.html' % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
             otu_table_fp=biom_fp,
             mapping_fp=mapping_fp,
             output_dir=arare_full_output_dir,
             command_handler=command_handler,
             params=params,
             qiime_config=qiime_config,
             tree_fp=tree_fp,
             num_steps=arare_num_steps,
             parallel=parallel,
             logger=logger,
             min_rare_depth=arare_min_rare_depth,
             max_rare_depth=sampling_depth,
             suppress_md5=True,
             status_update_callback=status_update_callback)
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n" \
                         % rarefaction_plots_output_fp)
    
        index_links.append(('Alpha rarefaction plots',
                            rarefaction_plots_output_fp,
                            _index_headers['alpha_diversity']))
                        
        collated_alpha_diversity_fps = \
         glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''
            
        for category in categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                alpha_comparison_output_fp = '%s/%s_%s.txt' % \
                 (arare_full_output_dir,category,alpha_metric)
                if not exists(alpha_comparison_output_fp):
                    compare_alpha_cmd = \
                     'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                     (collated_alpha_diversity_fp, mapping_fp, category, 
                      alpha_comparison_output_fp, params_str)
                    commands.append([('Compare alpha diversity (%s, %s)' %\
                                       (category,alpha_metric),
                                      compare_alpha_cmd)])
                else:
                    logger.write("Skipping compare_alpha_diversity.py for %s as %s exists.\n\n" \
                                 % (category, alpha_comparison_output_fp))
                index_links.append(
                 ('Alpha diversity statistics (%s, %s)' % (category,alpha_metric),
                  alpha_comparison_output_fp,
                  _index_headers['alpha_diversity']))
    
    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        # need to check for existence of any html files, since the user can 
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = glob(join(taxa_plots_output_dir,'taxa_summary_plots','*.html'))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
             otu_table_fp=biom_fp,
             mapping_fp=mapping_fp,
             output_dir=taxa_plots_output_dir,
             mapping_cat=None, 
             sort=True,
             command_handler=command_handler,
             params=params,
             qiime_config=qiime_config,
             logger=logger,
             suppress_md5=True,
             status_update_callback=status_update_callback)
        else:
            logger.write("Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n" \
                         % ', '.join(existing_taxa_plot_html_fps))

        index_links.append(('Taxa summary bar plots',
                            '%s/taxa_summary_plots/bar_charts.html'\
                              % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        index_links.append(('Taxa summary area plots',
                            '%s/taxa_summary_plots/area_charts.html'\
                              % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,category)
            # need to check for existence of any html files, since the user can 
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob('%s/taxa_summary_plots/*.html' % taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                 otu_table_fp=biom_fp,
                 mapping_fp=mapping_fp,
                 output_dir=taxa_plots_output_dir,
                 mapping_cat=category, 
                 sort=True,
                 command_handler=command_handler,
                 params=params,
                 qiime_config=qiime_config,
                 logger=logger,
                 suppress_md5=True,
                 status_update_callback=status_update_callback)
            else:
                logger.write("Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n" \
                             % (category, ', '.join(existing_taxa_plot_html_fps)))

            index_links.append(('Taxa summary bar plots',
                                '%s/taxa_summary_plots/bar_charts.html'\
                                  % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] % category))
            index_links.append(('Taxa summary area plots',
                                '%s/taxa_summary_plots/area_charts.html'\
                                  % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] % category))
    
    if not suppress_otu_category_significance:
        try:
            params_str = get_params_str(params['otu_category_significance'])
        except KeyError:
            params_str = ''
        # OTU category significance
        for category in categories:
            category_significance_fp = \
             '%s/category_significance_%s.txt' % (output_dir, category)
            if not exists(category_significance_fp):
                # Build the OTU category significance command
                category_significance_cmd = \
                 'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
                 (biom_fp, mapping_fp, category,
                  category_significance_fp, params_str)
                commands.append([('OTU category significance (%s)' % category,
                                  category_significance_cmd)])
            else:
                logger.write("Skipping otu_category_significance.py for %s as %s exists.\n\n" \
                             % (category, category_significance_fp))

            index_links.append(('Category significance (%s)' % category,
                        category_significance_fp,
                        _index_headers['otu_category_sig']))
    filtered_biom_gzip_fp = '%s.gz' % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([('Compress the filtered BIOM table','gzip %s' % filtered_biom_fp)])
        index_links.append(('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
                            filtered_biom_gzip_fp,
                            _index_headers['run_summary']))
    else:
        logger.write("Skipping compressing of filtered BIOM table as %s exists.\n\n" \
                     % filtered_biom_gzip_fp)
    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()
    
    generate_index_page(index_links,index_fp)
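# index_links entries are (link text, target path, section header) tuples.
# A minimal sketch of what generate_index_page() could look like, consistent
# with those call sites (grouping links under their section headers); this
# is not QIIME's actual implementation:
def generate_index_page_sketch(index_links, index_fp):
    sections = {}
    for link_text, target_fp, header in index_links:
        sections.setdefault(header, []).append((link_text, target_fp))
    with open(index_fp, 'w') as f:
        f.write('<html><body>\n')
        for header in sorted(sections):
            f.write('<h2>%s</h2>\n<ul>\n' % header)
            for link_text, target_fp in sections[header]:
                f.write('<li><a href="%s">%s</a></li>\n' % (target_fp,
                                                            link_text))
            f.write('</ul>\n')
        f.write('</body></html>\n')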
示例#27
0
def sample_ids_from_category_state_coverage(mapping_f,
                                            coverage_category,
                                            subject_category,
                                            min_num_states=None,
                                            required_states=None,
                                            considered_states=None,
                                            splitter_category=None):
    """Filter sample IDs based on subject's coverage of a category.

    Given a category that groups samples by subject (subject_category), samples
    are filtered by how well a subject covers (i.e. has at least one sample
    for) the category states in coverage_category.

    Two filtering criteria are provided (min_num_states and required_states). At
    least one must be provided. If both are provided, the subject must meet
    both criteria to pass the filter (i.e. providing both filters is an AND,
    not an OR, operation).

    A common use case is to provide a 'time' category for coverage_category and
    an 'individual' category for subject_category in order to filter out
    individuals from a study that do not have samples for some minimum number
    of timepoints (min_num_states) and that do not have samples for certain
    timepoints (required_states). For example, this could be the first and last
    timepoints in the study.

    Returns a set of sample IDs to keep, the number of subjects that were
    kept, and a set of the unique category states in coverage_category that
    were kept. The set of sample IDs is not guaranteed to be in any specific
    order relative to the order of sample IDs or subjects in the mapping file.

    Arguments:
        mapping_f - metadata mapping file (file-like object)
        coverage_category - category to test subjects' coverage (string)
        subject_category - category to group samples by subject (string)
        min_num_states - minimum number of category states in coverage_category
            that a subject must cover (i.e. have at least one sample for) to be
            included in results (integer)
        required_states - category states in coverage_category that must be
            covered by a subject's samples in order to be included in results
            (list of strings or items that can be converted to strings)
        considered_states - category states that are counted toward the 
            min_num_states (list of strings or items that can be converted to
            strings)
        splitter_category - category to split input mapping file on prior to
            processing. If not supplied, the mapping file will not be split. If
            supplied, a dictionary mapping splitter_category state to results
            will be returned instead of the three-element tuple. The supplied
            filtering criteria will apply to each split piece of the mapping
            file independently (e.g. if an individual passes the filters for
            the tongue samples, his/her tongue samples will be included for
            the tongue results, even if he/she doesn't pass the filters for the 
            palm samples)
    """
    metadata_map = MetadataMap.parseMetadataMap(mapping_f)

    # Make sure our input looks sane.
    categories_to_test = [coverage_category, subject_category]
    if splitter_category is not None:
        categories_to_test.append(splitter_category)

    if 'SampleID' in categories_to_test:
        raise ValueError("The 'SampleID' category is not suitable for use in "
                         "this function. Please choose a different category "
                         "from the metadata mapping file.")

    for category in categories_to_test:
        if category not in metadata_map.CategoryNames:
            raise ValueError("The category '%s' is not in the metadata "
                             "mapping file." % category)

    if len(set(categories_to_test)) < len(categories_to_test):
        raise ValueError("The coverage, subject, and (optional) splitter "
                         "categories must all be unique.")

    if required_states is not None:
        # required_states must be in coverage_category's states in the mapping
        # file.
        required_states = set(map(str,required_states))
        valid_coverage_states = set(metadata_map.getCategoryValues(
            metadata_map.SampleIds, coverage_category))
        invalid_coverage_states = required_states - valid_coverage_states

        if invalid_coverage_states:
            raise ValueError("The category state(s) '%s' are not in the '%s' "
                             "category in the metadata mapping file." %
                             (', '.join(invalid_coverage_states),
                              coverage_category))

    if considered_states is not None:
        # considered_states is not as restrictive as required_states - we don't 
        # require that these are present, so it's OK if some of the states
        # listed here don't actually show up in the mapping file (allowing
        # the user to pass something like range(100) to consider only states
        # that fall in some range)
        considered_states = set(map(str,considered_states))
        # define a function to determine if a state should be considered
        consider_state = lambda s: s in considered_states
    else:
        # define a dummy function to consider all states (the default
        # if the user does not provide a list of considered_states)
        consider_state = lambda s: True

    if min_num_states is None and required_states is None:
        raise ValueError("You must specify either the minimum number of "
                         "category states the subject must have samples for "
                         "(min_num_states), or the minimal category states "
                         "the subject must have samples for (required_states), "
                         "or both. Supplying neither filtering criteria is "
                         "not supported.")

    if splitter_category is None:
        results = _filter_sample_ids_from_category_state_coverage(
                metadata_map, metadata_map.SampleIds, coverage_category,
                subject_category, consider_state, min_num_states,
                required_states)
    else:
        # "Split" the metadata mapping file by extracting only sample IDs that
        # match the current splitter category state and using those for the
        # actual filtering.
        splitter_category_states = defaultdict(list)
        for samp_id in metadata_map.SampleIds:
            splitter_category_state = \
                    metadata_map.getCategoryValue(samp_id, splitter_category)
            splitter_category_states[splitter_category_state].append(samp_id)

        results = {}
        for splitter_category_state, sample_ids in \
            splitter_category_states.items():
            results[splitter_category_state] = \
                    _filter_sample_ids_from_category_state_coverage(
                            metadata_map, sample_ids, coverage_category,
                            subject_category, consider_state, min_num_states,
                            required_states)

    return results
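# Usage sketch for sample_ids_from_category_state_coverage(). The mapping
# content below is hypothetical, and we assume (as is typical for QIIME
# parsers) that any iterable of mapping-file lines works as the file-like
# input:
mapping_lines = [
    '#SampleID\tIndividual\tTime\tDescription\n',
    'S1\tsubj1\t1\tx\n',
    'S2\tsubj1\t2\tx\n',
    'S3\tsubj2\t1\tx\n',
]

# Keep only subjects sampled at both timepoints 1 and 2. Per the docstring,
# this returns (sample IDs kept, number of subjects kept, states covered).
sample_ids, num_subjects, states = sample_ids_from_category_state_coverage(
    mapping_lines, coverage_category='Time', subject_category='Individual',
    required_states=[1, 2])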
示例#28
0
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_group_significance=False,
    status_update_callback=print_to_stdout,
):
    """
    """
    if categories is not None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = parse_mapping_file_to_dict(open(mapping_fp, "U"))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)" % (c, ", ".join(metadata_map.CategoryNames))
                )
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values." % c
                )

    else:
        categories = []
    comma_separated_categories = ",".join(categories)
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = "%s/index.html" % output_dir
    index_links = []
    commands = []

    # begin logging
    old_log_fps = glob(join(output_dir, "log_20*txt"))
    log_fp = generate_log_fp(output_dir)
    index_links.append(("Master run log", log_fp, _index_headers["run_summary"]))
    for old_log_fp in old_log_fps:
        index_links.append(("Previous run log", old_log_fp, _index_headers["run_summary"]))
    logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params["biom-summarize-table"])
    except KeyError:
        params_str = ""
    biom_table_stats_output_fp = "%s/biom_table_summary.txt" % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = "biom summarize-table -i %s -o %s --suppress-md5 %s" % (
            biom_fp,
            biom_table_stats_output_fp,
            params_str,
        )
        commands.append([("Generate BIOM table summary", biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" % biom_table_stats_output_fp)
    index_links.append(("BIOM table statistics", biom_table_stats_output_fp, _index_headers["run_summary"]))

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" % (
            biom_fp,
            filtered_biom_fp,
            sampling_depth,
        )
        commands.append(
            [
                (
                    "Filter low sequence count samples from table (minimum sequence count: %d)" % sampling_depth,
                    filter_samples_cmd,
                )
            ]
        )
    else:
        logger.write("Skipping filter_samples_from_otu_table.py as %s exists.\n\n" % filtered_biom_fp)
    biom_fp = filtered_biom_fp

    # rarefy the BIOM table to sampling_depth
    rarefied_biom_fp = "%s/table_even%d.biom" % (output_dir, sampling_depth)
    if not exists(rarefied_biom_fp):
        single_rarefaction_cmd = "single_rarefaction.py -i %s -o %s -d %d" % (biom_fp, rarefied_biom_fp, sampling_depth)
        commands.append([("Rarefy the OTU table to %d sequences/sample" % sampling_depth, single_rarefaction_cmd)])
    else:
        logger.write("Skipping single_rarefaction.py as %s exists.\n\n" % rarefied_biom_fp)

    # run initial commands and reset the command list
    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
        commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = "%s/bdiv_even%d/" % (output_dir, sampling_depth)
        # Need to check for the existence of any distance matrices, since the user
        # can select which will be generated.
        existing_dm_fps = glob("%s/*_dm.txt" % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=rarefied_biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                # Note: we pass sampling_depth=None here because the BIOM
                # table was already rarefied above and is passed in directly.
                sampling_depth=None,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback,
            )
        else:
            logger.write("Skipping beta_diversity_through_plots.py as %s exist(s).\n\n" % ", ".join(existing_dm_fps))
            even_dm_fps = [(split(fp)[1].strip("_dm.txt"), fp) for fp in existing_dm_fps]

        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params["make_distance_boxplots"])
        except KeyError:
            params_str = ""

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = "%s/%s_boxplots/" % (bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = "%s/%s_Distances.pdf" % (boxplots_output_dir, category)
                stats_output_fp = "%s/%s_Stats.txt" % (boxplots_output_dir, category)
                if not exists(plot_output_fp):
                    boxplots_cmd = "make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s" % (
                        dm_fp,
                        category,
                        boxplots_output_dir,
                        mapping_fp,
                        params_str,
                    )
                    commands.append([("Boxplots (%s)" % category, boxplots_cmd)])
                else:
                    logger.write(
                        "Skipping make_distance_boxplots.py for %s as %s exists.\n\n" % (category, plot_output_fp)
                    )
                index_links.append(
                    (
                        "Distance boxplots (%s)" % bdiv_metric,
                        plot_output_fp,
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )
                index_links.append(
                    (
                        "Distance boxplots statistics (%s)" % bdiv_metric,
                        stats_output_fp,
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )

            index_links.append(
                (
                    "PCoA plot (%s)" % bdiv_metric,
                    "%s/%s_emperor_pcoa_plot/index.html" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Distance matrix (%s)" % bdiv_metric,
                    "%s/%s_dm.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Principal coordinate matrix (%s)" % bdiv_metric,
                    "%s/%s_pc.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = "%s/arare_max%d/" % (output_dir, sampling_depth)
        rarefaction_plots_output_fp = "%s/alpha_rarefaction_plots/rarefaction_plots.html" % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback,
                retain_intermediate_files=False,
            )
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n" % rarefaction_plots_output_fp)

        index_links.append(("Alpha rarefaction plots", rarefaction_plots_output_fp, _index_headers["alpha_diversity"]))

        collated_alpha_diversity_fps = glob("%s/alpha_div_collated/*txt" % arare_full_output_dir)
        try:
            params_str = get_params_str(params["compare_alpha_diversity"])
        except KeyError:
            params_str = ""

        if len(categories) > 0:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                compare_alpha_output_dir = "%s/compare_%s" % (arare_full_output_dir, alpha_metric)
                if not exists(compare_alpha_output_dir):
                    compare_alpha_cmd = "compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s" % (
                        collated_alpha_diversity_fp,
                        mapping_fp,
                        comma_separated_categories,
                        compare_alpha_output_dir,
                        params_str,
                    )
                    commands.append([("Compare alpha diversity (%s)" % alpha_metric, compare_alpha_cmd)])
                    for category in categories:
                        alpha_comparison_stat_fp = "%s/%s_stats.txt" % (compare_alpha_output_dir, category)
                        alpha_comparison_boxplot_fp = "%s/%s_boxplots.pdf" % (compare_alpha_output_dir, category)
                        index_links.append(
                            (
                                "Alpha diversity statistics (%s, %s)" % (category, alpha_metric),
                                alpha_comparison_stat_fp,
                                _index_headers["alpha_diversity"],
                            )
                        )
                        index_links.append(
                            (
                                "Alpha diversity boxplots (%s, %s)" % (category, alpha_metric),
                                alpha_comparison_boxplot_fp,
                                _index_headers["alpha_diversity"],
                            )
                        )
                else:
                    logger.write(
                        "Skipping compare_alpha_diversity.py"
                        " for %s as %s exists.\n\n" % (alpha_metric, compare_alpha_output_dir)
                    )
        else:
            logger.write("Skipping compare_alpha_diversity.py as" " no categories were provided.\n\n")

    if not suppress_taxa_summary:
        taxa_plots_output_dir = "%s/taxa_plots/" % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = glob(join(taxa_plots_output_dir, "taxa_summary_plots", "*.html"))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback,
            )
        else:
            logger.write(
                "Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n"
                % ", ".join(existing_taxa_plot_html_fps)
            )

        index_links.append(
            (
                "Taxa summary bar plots",
                "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        index_links.append(
            (
                "Taxa summary area plots",
                "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        for category in categories:
            taxa_plots_output_dir = "%s/taxa_plots_%s/" % (output_dir, category)
            # need to check for existence of any html files, since the user can
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob("%s/taxa_summary_plots/*.html" % taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback,
                )
            else:
                logger.write(
                    "Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n"
                    % (category, ", ".join(existing_taxa_plot_html_fps))
                )

            index_links.append(
                (
                    "Taxa summary bar plots",
                    "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )
            index_links.append(
                (
                    "Taxa summary area plots",
                    "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )

    if not suppress_group_significance:
        try:
            params_str = get_params_str(params["group_significance"])
        except KeyError:
            params_str = ""
        # group significance tests, aka category significance
        for category in categories:
            group_significance_fp = "%s/group_significance_%s.txt" % (output_dir, category)
            if not exists(group_significance_fp):
                # Build the group significance command
                group_significance_cmd = "group_significance.py -i %s -m %s -c %s -o %s %s" % (
                    rarefied_biom_fp,
                    mapping_fp,
                    category,
                    group_significance_fp,
                    params_str,
                )
                commands.append([("Group significance (%s)" % category, group_significance_cmd)])
            else:
                logger.write(
                    "Skipping group_significance.py for %s as %s exists.\n\n" % (category, group_significance_fp)
                )

            index_links.append(
                ("Category significance (%s)" % category, group_significance_fp, _index_headers["group_significance"])
            )

    filtered_biom_gzip_fp = "%s.gz" % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([("Compress the filtered BIOM table", "gzip %s" % filtered_biom_fp)])
    else:
        logger.write("Skipping compressing of filtered BIOM table as %s exists.\n\n" % filtered_biom_gzip_fp)
    index_links.append(
        (
            "Filtered BIOM table (minimum sequence count: %d)" % sampling_depth,
            filtered_biom_gzip_fp,
            _index_headers["run_summary"],
        )
    )

    rarefied_biom_gzip_fp = "%s.gz" % rarefied_biom_fp
    if not exists(rarefied_biom_gzip_fp):
        commands.append([("Compress the rarefied BIOM table", "gzip %s" % rarefied_biom_fp)])
    else:
        logger.write("Skipping compression of the rarefied BIOM table as %s exists.\n\n" % rarefied_biom_gzip_fp)
    index_links.append(
        (
            "Rarefied BIOM table (sampling depth: %d)" % sampling_depth,
            rarefied_biom_gzip_fp,
            _index_headers["run_summary"],
        )
    )

    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()

    generate_index_page(index_links, index_fp)
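# get_params_str() is used throughout but not defined in this excerpt. A
# sketch of the behavior implied by its call sites (turning one script's
# parameter dict, as produced by parse_qiime_parameters, into command-line
# flags); the exact formatting rules here are assumptions:
def get_params_str_sketch(params):
    result = []
    for param, value in params.items():
        if value is None:
            result.append('--%s' % param)
        else:
            result.append('--%s %s' % (param, value))
    return ' '.join(result)

# e.g. {'num_permutations': '999'} -> '--num_permutations 999'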
示例#29
0
def sample_ids_from_category_state_coverage(mapping_f,
                                            coverage_category,
                                            subject_category,
                                            min_num_states=None,
                                            required_states=None,
                                            considered_states=None,
                                            splitter_category=None):
    """Filter sample IDs based on subject's coverage of a category.

    Given a category that groups samples by subject (subject_category), samples
    are filtered by how well a subject covers (i.e. has at least one sample
    for) the category states in coverage_category.

    Two filtering criteria are provided (min_num_states and required_states). At
    least one must be provided. If both are provided, the subject must meet
    both criteria to pass the filter (i.e. providing both filters is an AND,
    not an OR, operation).

    A common use case is to provide a 'time' category for coverage_category and
    an 'individual' category for subject_category in order to filter out
    individuals from a study that do not have samples for some minimum number
    of timepoints (min_num_states) and that do not have samples for certain
    timepoints (required_states). For example, this could be the first and last
    timepoints in the study.

    Returns a set of sample IDs to keep, the number of subjects that were
    kept, and a set of the unique category states in coverage_category that
    were kept. The set of sample IDs is not guaranteed to be in any specific
    order relative to the order of sample IDs or subjects in the mapping file.

    Arguments:
        mapping_f - metadata mapping file (file-like object)
        coverage_category - category to test subjects' coverage (string)
        subject_category - category to group samples by subject (string)
        min_num_states - minimum number of category states in coverage_category
            that a subject must cover (i.e. have at least one sample for) to be
            included in results (integer)
        required_states - category states in coverage_category that must be
            covered by a subject's samples in order to be included in results
            (list of strings or items that can be converted to strings)
        considered_states - category states that are counted toward the 
            min_num_states (list of strings or items that can be converted to
            strings)
        splitter_category - category to split input mapping file on prior to
            processing. If not supplied, the mapping file will not be split. If
            supplied, a dictionary mapping splitter_category state to results
            will be returned instead of the three-element tuple. The supplied
            filtering criteria will apply to each split piece of the mapping
            file independently (e.g. if an individual passes the filters for
            the tongue samples, his/her tongue samples will be included for
            the tongue results, even if he/she doesn't pass the filters for the 
            palm samples)
    """
    metadata_map = MetadataMap.parseMetadataMap(mapping_f)

    # Make sure our input looks sane.
    categories_to_test = [coverage_category, subject_category]
    if splitter_category is not None:
        categories_to_test.append(splitter_category)

    if 'SampleID' in categories_to_test:
        raise ValueError("The 'SampleID' category is not suitable for use in "
                         "this function. Please choose a different category "
                         "from the metadata mapping file.")

    for category in categories_to_test:
        if category not in metadata_map.CategoryNames:
            raise ValueError("The category '%s' is not in the metadata "
                             "mapping file." % category)

    if len(set(categories_to_test)) < len(categories_to_test):
        raise ValueError("The coverage, subject, and (optional) splitter "
                         "categories must all be unique.")

    if required_states is not None:
        # required_states must be in coverage_category's states in the mapping
        # file.
        required_states = set(map(str, required_states))
        valid_coverage_states = set(
            metadata_map.getCategoryValues(metadata_map.SampleIds,
                                           coverage_category))
        invalid_coverage_states = required_states - valid_coverage_states

        if invalid_coverage_states:
            raise ValueError(
                "The category state(s) '%s' are not in the '%s' "
                "category in the metadata mapping file." %
                (', '.join(invalid_coverage_states), coverage_category))

    if considered_states is not None:
        # considered_states is not as restrictive as required_states - we don't
        # require that these are present, so it's OK if some of the states
        # listed here don't actually show up in the mapping file (allowing
        # the user to pass something like range(100) to consider only states
        # that fall in some range)
        considered_states = set(map(str, considered_states))
        # define a function to determine if a state should be considered
        consider_state = lambda s: s in considered_states
    else:
        # define a dummy function to consider all states (the default
        # if the user does not provide a list of considered_states)
        consider_state = lambda s: True

    if min_num_states is None and required_states is None:
        raise ValueError(
            "You must specify either the minimum number of "
            "category states the subject must have samples for "
            "(min_num_states), or the minimal category states "
            "the subject must have samples for (required_states), "
            "or both. Supplying neither filtering criteria is "
            "not supported.")

    if splitter_category is None:
        results = _filter_sample_ids_from_category_state_coverage(
            metadata_map, metadata_map.SampleIds, coverage_category,
            subject_category, consider_state, min_num_states, required_states)
    else:
        # "Split" the metadata mapping file by extracting only sample IDs that
        # match the current splitter category state and using those for the
        # actual filtering.
        splitter_category_states = defaultdict(list)
        for samp_id in metadata_map.SampleIds:
            splitter_category_state = \
                    metadata_map.getCategoryValue(samp_id, splitter_category)
            splitter_category_states[splitter_category_state].append(samp_id)

        results = {}
        for splitter_category_state, sample_ids in \
            splitter_category_states.items():
            results[splitter_category_state] = \
                    _filter_sample_ids_from_category_state_coverage(
                            metadata_map, sample_ids, coverage_category,
                            subject_category, consider_state, min_num_states,
                            required_states)

    return results
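# _filter_sample_ids_from_category_state_coverage() is private and not shown
# in this excerpt. A sketch consistent with the documented contract (returns
# the kept sample IDs, the number of subjects kept, and the kept
# coverage-category states); the internal data structures are assumptions:
from collections import defaultdict

def _filter_sketch(metadata_map, sample_ids, coverage_category,
                   subject_category, consider_state, min_num_states,
                   required_states):
    # group the sample IDs by subject
    subjects = defaultdict(list)
    for samp_id in sample_ids:
        subj = metadata_map.getCategoryValue(samp_id, subject_category)
        subjects[subj].append(samp_id)

    ids_to_keep, states_kept, num_subjects = set(), set(), 0
    for subj, samps in subjects.items():
        states = set(metadata_map.getCategoryValue(s, coverage_category)
                     for s in samps)
        considered = set(s for s in states if consider_state(s))
        if min_num_states is not None and len(considered) < min_num_states:
            continue
        if required_states is not None and \
                not required_states.issubset(states):
            continue
        ids_to_keep.update(samps)
        states_kept.update(states)
        num_subjects += 1
    return ids_to_keep, num_subjects, states_kept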
示例#30
0
def _color_field_states(map_f, samp_ids, field, field_states, color_by_field):
    """Colors one field by another.

    Returns a list of matplotlib-compatible colors, one for each of the input
    field_states. Also returns a dictionary mapping color_by_field states to
    colors (useful for building a legend, for example).

    If there are not enough colors available (they are drawn from
    qiime.colors.data_colors), an error will be raised as the color mapping
    (and legend) will be ambiguous.

    A one-to-one mapping must exist between each field_state and its
    corresponding color_by field state (otherwise it is unclear which
    corresponding color_by field state should be used to color it by). An error
    will be raised if this one-to-one mapping does not exist.

    Arguments:
        map_f - the mapping file (file-like object)
        samp_ids - a list of sample IDs to consider in the mapping file. Only
            these sample IDs will be used when coloring field states
        field - the field in the mapping file to color
        field_states - the field states in field to color
        color_by_field - the field in the mapping file to color field_states by
    """
    colors = []
    color_pool = [matplotlib_rgb_color(data_colors[color].toRGB()) for color in data_color_order]
    metadata_map = MetadataMap.parseMetadataMap(map_f)

    for field_to_check in field, color_by_field:
        if field_to_check not in metadata_map.CategoryNames:
            raise ValueError("The field '%s' is not in the metadata mapping " "file's column headers." % field_to_check)

    all_field_states = metadata_map.getCategoryValues(samp_ids, field)
    all_color_by_states = metadata_map.getCategoryValues(samp_ids, color_by_field)

    if len(set(field_states) - set(all_field_states)) != 0:
        raise ValueError("Encountered unrecognizable field state(s) in %r " "for field '%s'." % (field_states, field))

    # Build mapping from one field to the other.
    field_mapping = defaultdict(list)
    for field_state, color_by_state in zip(all_field_states, all_color_by_states):
        if field_state in field_states:
            field_mapping[field_state].append(color_by_state)

    # For each of the specified input field states, find its corresponding
    # "color by" field state and give it a color if it hasn't been assigned one
    # yet. Make sure we have enough colors and there is a one-to-one mapping.
    color_mapping = {}
    for field_state in field_states:
        color_by_states = set(field_mapping[field_state])

        if len(color_by_states) != 1:
            raise ValueError(
                "The field '%s' to color by does not have a "
                "one-to-one mapping with field '%s'. Coloring "
                "would be ambiguous." % (color_by_field, field)
            )

        color_by_state = list(color_by_states)[0]
        if color_by_state not in color_mapping:
            if len(color_pool) > 0:
                color_mapping[color_by_state] = color_pool.pop(0)
            else:
                raise ValueError(
                    "There are not enough available QIIME colors "
                    "to color each of the field states in field "
                    "'%s'. Coloring would be ambiguous." % color_by_field
                )

        colors.append(color_mapping[color_by_state])

    return colors, color_mapping
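# Usage sketch for _color_field_states(). The mapping content below is
# hypothetical, and we assume any iterable of mapping-file lines is accepted
# as the file-like input:
mapping_lines = [
    '#SampleID\tBodySite\tSubject\tDescription\n',
    'S1\tgut\tsubj1\tx\n',
    'S2\tgut\tsubj1\tx\n',
    'S3\ttongue\tsubj2\tx\n',
]

# Color each BodySite state by its (unique) corresponding Subject state.
# Per the docstring, this returns one color per requested field state plus
# a {color_by state: color} mapping for building a legend.
colors, legend = _color_field_states(
    mapping_lines, ['S1', 'S2', 'S3'], 'BodySite', ['gut', 'tongue'],
    'Subject')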