示例#1
0
    def setUp(self):
        """Create the fixtures used by the tests in this class.

        Builds a small OTU map, two temporary file paths, taxa summary
        inputs, a parsed BIOM table, references to the expected HTML strings,
        correlation vectors, and the distance matrix / metadata map /
        statistical method objects used by the formatter tests.
        """
        # Pairs of (OTU ID, list of sequence IDs).
        self.otu_map1 = [
            ('0', ['seq1', 'seq2', 'seq5']),
            ('1', ['seq3', 'seq4']),
            ('2', ['seq6', 'seq7', 'seq8']),
        ]

        # Two temporary file paths, plus a list of files to remove that starts
        # out empty.
        tmp_name_options = {'prefix': 'FormatTests_', 'suffix': '.txt'}
        self.tmp_fp1 = get_tmp_filename(**tmp_name_options)
        self.tmp_fp2 = get_tmp_filename(**tmp_name_options)
        self.files_to_remove = []

        # Inputs for the taxa summary tests.
        self.add_taxa_summary = {'s1': [1, 2], 's2': [3, 4]}
        self.add_taxa_header = ['sample_id', 'foo', 'bar']
        self.add_taxa_order = ['a;b;c', 'd;e;f']
        self.add_taxa_mapping = [
            ['s1', 'something1', 'something2'],
            ['s2', 'something3', 'something4'],
            ['s3', 'something5', 'something6'],
        ]
        self.biom1 = parse_biom_table(biom1.split('\n'))

        # Module-level expected HTML strings, exposed on the test instance.
        self.expected_formatted_html_no_errors_warnings = (
            expected_formatted_html_no_errors_warnings)
        self.expected_formatted_html_errors = (
            expected_formatted_html_errors)
        self.expected_formatted_html_warnings = (
            expected_formatted_html_warnings)
        self.expected_formatted_html_data_nonloc_error = (
            expected_formatted_html_data_nonloc_error)

        # For testing formatting of correlation vectors. The first entry of
        # corr_vec2 has the same value as the only entry of corr_vec1.
        first_corr = ('S1', 'T1', 0.7777777777, 0, 0, 0, 0, (0.5, 1.0))
        self.corr_vec1 = [first_corr]
        self.corr_vec2 = [
            first_corr,
            ('S2', 'T2', 0.1, 0.05, 0.15, 0.04, 0.12, (-0.1, 0.2)),
            ('S3', 'T3', 100.68, 0.9, 1, 1, 1, (-0.4, -0.2)),
        ]
        self.corr_vec3 = [
            ('S1', 'T1', 0.7777777777, 0, 0, 0, 0, (None, None)),
        ]

        # For testing statistical method formatters.
        self.overview_dm = DistanceMatrix.parseDistanceMatrix(
            overview_dm_lines)
        self.overview_map = MetadataMap.parseMetadataMap(overview_map_lines)

        self.soils_dm = DistanceMatrix.parseDistanceMatrix(soils_dm_lines)
        self.soils_map = MetadataMap.parseMetadataMap(soils_map_lines)

        self.anosim_overview = Anosim(
            self.overview_map, self.overview_dm, 'Treatment')
        self.permanova_overview = Permanova(
            self.overview_map, self.overview_dm, 'Treatment')
        self.best_overview = Best(
            self.overview_dm, self.overview_map, ['DOB'])

        soils_categories = [
            'TOT_ORG_CARB', 'SILT_CLAY', 'ELEVATION',
            'SOIL_MOISTURE_DEFICIT', 'CARB_NITRO_RATIO',
            'ANNUAL_SEASON_TEMP', 'ANNUAL_SEASON_PRECPT', 'PH',
            'CMIN_RATE', 'LONGITUDE', 'LATITUDE',
        ]
        self.best_88_soils = Best(
            self.soils_dm, self.soils_map, soils_categories)
示例#2
0
def choose_cluster_subsets(otu_table_f, map_f, category, num_total_samples):
    """Chooses a subset of samples, grouping them by a mapping file category.

    Returns a two-element tuple containing the OTU table filtered to the
    chosen samples and the result of filtering the mapping file to the same
    samples.

    Raises InvalidSubsetSize if num_total_samples is larger than the number of
    samples in the OTU table.

    Arguments:
        otu_table_f - OTU table input, passed to parse_biom_table
        map_f - metadata mapping file input, passed to
            MetadataMap.parseMetadataMap and later to
            filter_mapping_file_from_mapping_f
        category - mapping file category whose values define the clusters
        num_total_samples - number of samples the subset must contain
    """
    table = parse_biom_table(otu_table_f)
    mapping = MetadataMap.parseMetadataMap(map_f)

    # map_f is consumed a second time at the end of this function, so rewind
    # it if it supports seeking. Inputs without a seek() method are left
    # untouched.
    try:
        map_f.seek(0)
    except AttributeError:
        pass

    table_samp_ids = table.SampleIds
    num_available = len(table_samp_ids)
    if num_total_samples > num_available:
        raise InvalidSubsetSize("Too many total samples (%d) were specified "
                                "as a subset size. There are only %d total "
                                "samples to choose a subset from." %
                                (num_total_samples, num_available))

    # Group the sample IDs by their category value. Mapping files can have
    # more samples than OTU tables, so skip IDs the table doesn't know about.
    clusters = defaultdict(list)
    for sample_id in mapping.SampleIds:
        if sample_id not in table_samp_ids:
            continue
        cluster_key = mapping.getCategoryValue(sample_id, category)
        clusters[cluster_key].append(sample_id)

    chosen, extras = _choose_items_from_clusters(clusters, table_samp_ids,
                                                 num_total_samples)
    chosen.extend(extras)

    # Internal sanity checks on the selection.
    num_chosen = len(chosen)
    assert num_chosen == num_total_samples, \
        "%d != %d" % (num_chosen, num_total_samples)
    assert num_chosen == len(set(chosen)), "Duplicate sample IDs in subset"

    filtered_table = filter_samples_from_otu_table(table, chosen, 0, inf)
    filtered_map = filter_mapping_file_from_mapping_f(map_f, chosen)
    return (filtered_table, filtered_map)
示例#3
0
def check_mapping_file_category(loaded_biom, mapping_fp, mapping_category,
                                subcategory_1, subcategory_2):
    """Validates the mapping category and the two subcategories to compare.

    Does not return anything; a ValueError is raised on the first failed
    check.

    Raises ValueError if mapping_category is not a mapping file column, if
    either subcategory is not a value in that column, or if the two
    subcategories are equal.

    Arguments:
        loaded_biom - BIOM table whose sample IDs are used to filter the
            mapping file
        mapping_fp - filepath to the metadata mapping file
        mapping_category - mapping file column to check
        subcategory_1 - first value expected in mapping_category (-x)
        subcategory_2 - second value expected in mapping_category (-y)
    """
    with open(mapping_fp, 'U') as mapping_file:
        metadata = MetadataMap.parseMetadataMap(mapping_file)

    # Remove mapping file samples that are not in the input BIOM table.
    metadata.filterSamples(loaded_biom.ids(axis='sample'), strict=True)

    if mapping_category not in metadata.CategoryNames:
        raise ValueError("category '%s' not found in mapping file "
                         "columns." % mapping_category)

    column_values = metadata.getCategoryValues(metadata.sample_ids,
                                               mapping_category)

    # Both subcategories must be present in the column; check them in order
    # so that a problem with subcategory_1 is reported first.
    labelled_subcategories = (('subcategory_1 (-x)', subcategory_1),
                              ('subcategory_2 (-y)', subcategory_2))
    for label, subcategory in labelled_subcategories:
        if subcategory not in column_values:
            raise ValueError("%s '%s' not found in selected "
                             "mapping file column." % (label, subcategory))

    if subcategory_1 == subcategory_2:
        raise ValueError(
            "subcategory_1 (-x) must be different from subcategory_2 (-y)")
示例#4
0
def _generate_alpha_diversity_boxplots(collated_adiv_dir, map_fp,
                                       split_category, comparison_category,
                                       rarefaction_depth, output_dir):
    """Generates per-body-site self vs. other alpha diversity boxplots.

    Creates a plot for each input collated alpha diversity file (i.e. metric)
    in collated_adiv_dir. Returns a list of plot filenames that were created in
    output_dir. The filenames are basenames of the form '<metric>.png', where
    <metric> is the collated file's name without its extension.

    Arguments:
        collated_adiv_dir - path to directory containing one or more collated
            alpha diversity files (only '*.txt' files are considered)
        map_fp - filepath to metadata mapping file
        split_category - category to split on, e.g. body site. A boxplot will
            be created for each category value (e.g. tongue, palm, etc.)
        comparison_category - category to split on within each of the split
            categories (e.g. self, other)
        rarefaction_depth - rarefaction depth to use when pulling data from
            rarefaction files
        output_dir - directory to write output plot images to
    """
    # Use context managers so that input file handles are closed as soon as
    # they have been read, instead of being left open until garbage
    # collection.
    with open(map_fp, 'U') as map_f:
        metadata_map = MetadataMap.parseMetadataMap(map_f)

    collated_adiv_fps = glob(join(collated_adiv_dir, '*.txt'))
    plot_title = 'Alpha diversity (%d seqs/sample)' % rarefaction_depth

    # Generate a plot for each collated alpha diversity metric file.
    created_files = []
    for collated_adiv_fp in collated_adiv_fps:
        adiv_metric = splitext(basename(collated_adiv_fp))[0]

        # NOTE(review): assumes _collect_alpha_diversity_boxplot_data reads
        # everything it needs from the file before returning - confirm.
        with open(collated_adiv_fp, 'U') as collated_adiv_f:
            x_tick_labels, dists = _collect_alpha_diversity_boxplot_data(
                collated_adiv_f, metadata_map, rarefaction_depth,
                split_category, comparison_category)

        plot_figure = generate_box_plots(dists,
                                         x_tick_labels=x_tick_labels,
                                         title=plot_title,
                                         x_label='Grouping',
                                         y_label=format_title(adiv_metric))
        plot_fp = join(output_dir, '%s.png' % adiv_metric)
        plot_figure.savefig(plot_fp)
        created_files.append(basename(plot_fp))

    return created_files
示例#5
0
def subset_groups(dm_f, map_f, category, max_group_size):
    """Randomly limits the number of samples kept per category value.

    Samples are grouped by their value in category, at most max_group_size
    samples are randomly drawn from each group, and the selected sample IDs
    are passed to filter_samples_from_distance_matrix with negate=True. The
    result of that call is returned.

    Arguments:
        dm_f - distance matrix input, passed to parse_distmat
        map_f - metadata mapping file input, passed to
            MetadataMap.parseMetadataMap
        category - mapping file category used to group the samples
        max_group_size - maximum number of samples to draw from each group
    """
    labels, distances = parse_distmat(dm_f)
    mapping = MetadataMap.parseMetadataMap(map_f)

    # Mapping files can have more samples than distance matrices, which can
    # happen in this case since we are dealing with rarefied OTU tables
    # (samples get dropped), so only group samples the matrix contains.
    groups = defaultdict(list)
    for sample_id in mapping.SampleIds:
        if sample_id not in labels:
            continue
        group_key = mapping.getCategoryValue(sample_id, category)
        groups[group_key].append(sample_id)

    # Draw a random subset from every group; small groups are kept whole.
    selected_ids = []
    for members in groups.values():
        draw_size = min(max_group_size, len(members))
        selected_ids += sample(members, draw_size)

    return filter_samples_from_distance_matrix((labels, distances),
                                               selected_ids, negate=True)
def check_mapping_file_category(loaded_biom, mapping_fp, mapping_category, subcategory_1, subcategory_2):
    """Checks that a mapping category and two of its values are usable.

    Nothing is returned. A ValueError is raised if mapping_category is not a
    column in the mapping file, if subcategory_1 or subcategory_2 is not one
    of the values in that column, or if both subcategories are the same.

    Arguments:
        loaded_biom - BIOM table providing the sample IDs to keep
        mapping_fp - filepath to the metadata mapping file
        mapping_category - name of the mapping file column to inspect
        subcategory_1 - first value to look for in the column (-x)
        subcategory_2 - second value to look for in the column (-y)
    """
    with open(mapping_fp, 'U') as handle:
        parsed_map = MetadataMap.parseMetadataMap(handle)

    # Remove mapping file samples that are not in the input BIOM table.
    biom_sample_ids = loaded_biom.ids(axis='sample')
    parsed_map.filterSamples(biom_sample_ids, strict=True)

    category_is_known = mapping_category in parsed_map.CategoryNames
    if not category_is_known:
        raise ValueError("category '%s' not found in mapping file "
                         "columns." % mapping_category)

    remaining_ids = parsed_map.sample_ids
    values_in_column = parsed_map.getCategoryValues(remaining_ids,
                                                    mapping_category)

    if not (subcategory_1 in values_in_column):
        raise ValueError("subcategory_1 (-x) '%s' not found in selected "
                         "mapping file column." % subcategory_1)

    if not (subcategory_2 in values_in_column):
        raise ValueError("subcategory_2 (-y) '%s' not found in selected "
                         "mapping file column." % subcategory_2)

    # Comparing a value against itself is not meaningful.
    if subcategory_2 == subcategory_1:
        raise ValueError("subcategory_1 (-x) must be different from subcategory_2 (-y)")
示例#7
0
def sample_ids_from_category_state_coverage(mapping_f,
                                            coverage_category,
                                            subject_category,
                                            min_num_states=None,
                                            required_states=None,
                                            considered_states=None,
                                            splitter_category=None):
    """Filter sample IDs by how well each subject covers a category.

    Samples are grouped by subject (subject_category) and a subject is kept
    or dropped depending on which states of coverage_category it has at least
    one sample for.

    There are two filtering criteria, min_num_states and required_states. At
    least one of them is mandatory. When both are given they are combined
    with AND: a subject has to satisfy both to be kept.

    Typical usage is a 'time' column as coverage_category and an 'individual'
    column as subject_category, to drop individuals that were not sampled at
    enough timepoints (min_num_states) or that are missing particular
    timepoints (required_states), e.g. the first and last timepoints of the
    study.

    Returns a set of sample IDs to keep, the number of subjects kept, and a
    set of the unique coverage_category states that were kept. No ordering of
    the sample IDs relative to the mapping file is guaranteed. If
    splitter_category is given, a dictionary mapping each splitter_category
    state to such a result is returned instead.

    Raises ValueError if 'SampleID' is used as a category, if a category is
    not in the mapping file, if the categories are not all different, if a
    required state is not present in coverage_category, or if neither
    min_num_states nor required_states is given.

    Arguments:
        mapping_f - metadata mapping file (file-like object)
        coverage_category - category whose states subjects must cover
            (string)
        subject_category - category that groups samples by subject (string)
        min_num_states - minimum number of coverage_category states a subject
            must have at least one sample for (integer)
        required_states - coverage_category states that a subject's samples
            must cover (list of strings or items that can be converted to
            strings)
        considered_states - states that count toward min_num_states (list of
            strings or items that can be converted to strings). States that
            do not occur in the mapping file are allowed
        splitter_category - category to split the mapping file on before
            filtering. The filtering criteria are applied to each piece
            independently (e.g. an individual's tongue samples are kept in
            the tongue results if they pass the filters, even if the same
            individual's palm samples do not)
    """
    mapping = MetadataMap.parseMetadataMap(mapping_f)

    # Collect the categories we were asked to work with and sanity-check
    # them.
    requested_categories = [coverage_category, subject_category]
    if splitter_category is not None:
        requested_categories.append(splitter_category)

    if 'SampleID' in requested_categories:
        raise ValueError("The 'SampleID' category is not suitable for use in "
                         "this function. Please choose a different category "
                         "from the metadata mapping file.")

    for requested in requested_categories:
        if requested not in mapping.CategoryNames:
            raise ValueError("The category '%s' is not in the metadata "
                             "mapping file." % requested)

    if len(set(requested_categories)) < len(requested_categories):
        raise ValueError("The coverage, subject, and (optional) splitter "
                         "categories must all be unique.")

    if required_states is not None:
        # Every required state has to be an actual state of
        # coverage_category in the mapping file.
        required_states = set(str(state) for state in required_states)
        known_states = set(mapping.getCategoryValues(mapping.SampleIds,
                                                     coverage_category))
        unknown_states = required_states - known_states

        if unknown_states:
            raise ValueError("The category state(s) '%s' are not in the '%s' "
                             "category in the metadata mapping file." %
                             (', '.join(unknown_states),
                              coverage_category))

    if considered_states is None:
        # Default: every state is considered.
        def consider_state(state):
            return True
    else:
        # Unlike required_states, considered_states may list states that
        # never show up in the mapping file (so a caller can pass something
        # like range(100) to consider only states within some range).
        considered_states = set(str(state) for state in considered_states)

        def consider_state(state):
            return state in considered_states

    if min_num_states is None and required_states is None:
        raise ValueError("You must specify either the minimum number of "
                         "category states the subject must have samples for "
                         "(min_num_states), or the minimal category states "
                         "the subject must have samples for (required_states), "
                         "or both. Supplying neither filtering criteria is "
                         "not supported.")

    if splitter_category is None:
        return _filter_sample_ids_from_category_state_coverage(
            mapping, mapping.SampleIds, coverage_category, subject_category,
            consider_state, min_num_states, required_states)

    # "Split" the mapping file: bucket the sample IDs by their
    # splitter_category state and filter each bucket on its own.
    ids_by_split_state = defaultdict(list)
    for sample_id in mapping.SampleIds:
        split_state = mapping.getCategoryValue(sample_id, splitter_category)
        ids_by_split_state[split_state].append(sample_id)

    results = {}
    for split_state, split_sample_ids in ids_by_split_state.items():
        results[split_state] = \
            _filter_sample_ids_from_category_state_coverage(
                mapping, split_sample_ids, coverage_category,
                subject_category, consider_state, min_num_states,
                required_states)

    return results
示例#8
0
def _color_field_states(map_f, samp_ids, field, field_states, color_by_field):
    """Assigns colors to the states of one field based on another field.

    Returns a list of matplotlib-compatible colors, one per entry of
    field_states and in the same order, together with a dictionary mapping
    each color_by_field state that was used to its color (handy for building
    a legend).

    Colors are taken, in order, from qiime.colors.data_colors. A ValueError
    is raised if they run out, because the coloring (and legend) would be
    ambiguous.

    Every field state must correspond to exactly one color_by_field state
    among the considered samples; otherwise it is unclear which color to use
    and a ValueError is raised. A ValueError is also raised if either field
    is missing from the mapping file, or if field_states contains a state
    that does not occur in field for the given samples.

    Arguments:
        map_f - the mapping file (file-like object)
        samp_ids - a list of sample IDs to consider in the mapping file. Only
            these sample IDs will be used when coloring field states
        field - the field in the mapping file to color
        field_states - the field states in field to color
        color_by_field - the field in the mapping file to color field_states by
    """
    # Colors still available for assignment, in preference order.
    available_colors = []
    for color_name in data_color_order:
        available_colors.append(
            matplotlib_rgb_color(data_colors[color_name].toRGB()))

    mapping = MetadataMap.parseMetadataMap(map_f)

    for column in (field, color_by_field):
        if column not in mapping.CategoryNames:
            raise ValueError("The field '%s' is not in the metadata mapping "
                             "file's column headers." % column)

    observed_states = mapping.getCategoryValues(samp_ids, field)
    observed_color_by_states = mapping.getCategoryValues(samp_ids,
                                                         color_by_field)

    if not set(field_states).issubset(observed_states):
        raise ValueError("Encountered unrecognizable field state(s) in %r "
                         "for field '%s'." % (field_states, field))

    # For every requested field state, collect the color_by_field states it
    # occurs together with.
    partners = defaultdict(list)
    for state, partner in zip(observed_states, observed_color_by_states):
        if state in field_states:
            partners[state].append(partner)

    # Walk the requested field states in order, look up the single
    # color_by_field state each one belongs to, and hand out a new color the
    # first time that color_by_field state is seen.
    color_mapping = {}
    colors = []
    for state in field_states:
        distinct_partners = set(partners[state])

        if len(distinct_partners) != 1:
            raise ValueError("The field '%s' to color by does not have a "
                             "one-to-one mapping with field '%s'. Coloring "
                             "would be ambiguous." % (color_by_field, field))

        partner = next(iter(distinct_partners))
        if partner not in color_mapping:
            if not available_colors:
                raise ValueError("There are not enough available QIIME colors "
                                 "to color each of the field states in field "
                                 "'%s'. Coloring would be ambiguous." %
                                 color_by_field)
            color_mapping[partner] = available_colors.pop(0)

        colors.append(color_mapping[partner])

    return colors, color_mapping
示例#9
0
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run. 'adonis', 'morans_i',
            'mrpp', 'permdisp' and 'dbrda' are run through R; 'anosim',
            'best' and 'permanova' are run in Python
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'best', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'best' or 'morans_i', this parameter will be
            ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it

    Raises:
        TypeError - if categories is not a list, or if method is 'morans_i'
            and a category is not numeric
        ValueError - if 'SampleID' is used as a category, if method is not
            recognized, or (for the R-based methods) if the distance matrix
            is not symmetric and hollow, a category is missing from the
            mapping file, a category has only a single value, a category has
            only unique values (all but 'morans_i'), or num_perms is negative
            (all but 'morans_i')
    """

    # Make sure we were passed a list of categories, not a single string.
    # The builtin list is the same type that types.ListType aliases.
    if not isinstance(categories, list):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BEST analyses). "
                         "Please use a different metadata column to perform "
                         "statistical tests on.")

    # Parse the mapping file and distance matrix. Context managers make sure
    # the input file handles are closed once parsing is done.
    with open(map_fp, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)

    with open(dm_fp, 'U') as dm_f:
        dm = DistanceMatrix.parseDistanceMatrix(dm_f)

    # Remove any samples from the mapping file that aren't in the distance
    # matrix (important for validation checks). Use strict=True so that an
    # error is raised if the distance matrix contains any samples that aren't
    # in the mapping file.
    md_map.filterSamples(dm.SampleIds, strict=True)

    # Run the specified statistical method.
    if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
        # These methods are run in R. Input validation must be done here before
        # running the R commands. The pure-Python implementations perform all
        # validation in the classes in the stats module.

        # Make sure the input distance matrix is symmetric and hollow.
        if not dm.is_symmetric_and_hollow():
            raise ValueError("The distance matrix must be symmetric and "
                             "hollow.")

        # Check to make sure all categories passed in are in mapping file and
        # are not all the same value.
        for category in categories:
            if category not in md_map.CategoryNames:
                raise ValueError("Category '%s' not found in mapping file "
                                 "columns." % category)

            if md_map.hasSingleCategoryValue(category):
                raise ValueError("All values in category '%s' are the "
                                 "same. The statistical method '%s' cannot "
                                 "operate on a category that creates only "
                                 "a single group of samples (e.g. there "
                                 "are no 'between' distances because "
                                 "there is only a single group)." %
                                 (category, method))

        # Build the command arguments string. It is kept in a one-element
        # list so that the permutations option can be appended below.
        command_args = [
            '-d %s -m %s -c %s -o %s' % (dm_fp, map_fp, categories[0], out_dir)
        ]

        if method == 'morans_i':
            # Moran's I requires only numeric categories.
            for category in categories:
                if not md_map.isNumericCategory(category):
                    raise TypeError(
                        "The category '%s' is not numeric. Not "
                        "all values could be converted to numbers." % category)
        else:
            # The rest require groups of samples, so the category values cannot
            # all be unique.
            for category in categories:
                if md_map.hasUniqueCategoryValues(category):
                    raise ValueError("All values in category '%s' are unique. "
                                     "This statistical method cannot operate "
                                     "on a category with unique values (e.g. "
                                     "there are no 'within' distances because "
                                     "each group of samples contains only a "
                                     "single sample)." % category)

            # Only Moran's I doesn't accept a number of permutations.
            if num_perms < 0:
                raise ValueError("The number of permutations must be greater "
                                 "than or equal to zero.")

            command_args[0] += ' -n %d' % num_perms

        rex = RExecutor(TmpDir=get_qiime_temp_dir())
        rex(command_args, '%s.r' % method, output_dir=out_dir)
    elif method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(num_perms)

        # The results file is only created once the test has run, and the
        # context manager closes it even if formatting or writing fails.
        with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
            out_f.write(format_anosim_results(anosim_results))
    elif method == 'best':
        best = Best(dm, md_map, categories)
        best_results = best()

        with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
            out_f.write(format_best_results(best_results))
    elif method == 'permanova':
        permanova = Permanova(md_map, dm, categories[0])
        permanova_results = permanova(num_perms)

        with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
            out_f.write(format_permanova_results(permanova_results))
    else:
        raise ValueError("Unrecognized method '%s'. Valid methods: %r" %
                         (method, methods))
示例#10
0
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run. 'adonis', 'morans_i',
            'mrpp', 'permdisp' and 'dbrda' are run through R; 'anosim',
            'best' and 'permanova' are run in Python
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'best', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'best' or 'morans_i', this parameter will be
            ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it

    Raises:
        TypeError - if categories is not a list, or if method is 'morans_i'
            and a category is not numeric
        ValueError - if 'SampleID' is used as a category, if method is not
            recognized, or (for the R-based methods) if a category is missing
            from the mapping file, a category has only a single value, a
            category has only unique values (all but 'morans_i'), or
            num_perms is negative (all but 'morans_i')
    """

    # Make sure we were passed a list of categories, not a single string.
    # The builtin list is the same type that types.ListType aliases.
    if not isinstance(categories, list):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BEST analyses). "
                         "Please use a different metadata column to perform "
                         "statistical tests on.")

    # Parse the mapping file and distance matrix.
    with open(map_fp, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)

    with open(dm_fp, 'U') as dm_f:
        dm = SymmetricDistanceMatrix.from_file(dm_f)

    # Remove any samples from the mapping file that aren't in the distance
    # matrix (important for validation checks). Use strict=True so that an
    # error is raised if the distance matrix contains any samples that aren't
    # in the mapping file.
    md_map.filterSamples(dm.sample_ids, strict=True)

    # Run the specified statistical method.
    if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
        # These methods are run in R. Input validation must be done here before
        # running the R commands. The pure-Python implementations perform all
        # validation in the classes in the stats module.

        # Check to make sure all categories passed in are in mapping file and
        # are not all the same value.
        for category in categories:
            if category not in md_map.CategoryNames:
                raise ValueError("Category '%s' not found in mapping file "
                                 "columns." % category)

            if md_map.hasSingleCategoryValue(category):
                raise ValueError("All values in category '%s' are the "
                                 "same. The statistical method '%s' cannot "
                                 "operate on a category that creates only "
                                 "a single group of samples (e.g. there "
                                 "are no 'between' distances because "
                                 "there is only a single group)."
                                 % (category, method))

        # Build the command arguments string. It is kept in a one-element
        # list so that the permutations option can be appended below.
        command_args = ['-d %s -m %s -c %s -o %s'
                        % (dm_fp, map_fp, categories[0], out_dir)]

        if method == 'morans_i':
            # Moran's I requires only numeric categories.
            for category in categories:
                if not md_map.isNumericCategory(category):
                    raise TypeError("The category '%s' is not numeric. Not "
                                    "all values could be converted to numbers."
                                    % category)
        else:
            # The rest require groups of samples, so the category values cannot
            # all be unique.
            for category in categories:
                if md_map.hasUniqueCategoryValues(category):
                    raise ValueError("All values in category '%s' are unique. "
                                     "This statistical method cannot operate "
                                     "on a category with unique values (e.g. "
                                     "there are no 'within' distances because "
                                     "each group of samples contains only a "
                                     "single sample)." % category)

            # Only Moran's I doesn't accept a number of permutations.
            if num_perms < 0:
                raise ValueError("The number of permutations must be greater "
                                 "than or equal to zero.")

            command_args[0] += ' -n %d' % num_perms

        rex = RExecutor(TmpDir=get_qiime_temp_dir())
        rex(command_args, '%s.r' % method, output_dir=out_dir)
    elif method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(num_perms)

        # The results file is only created once the test has run, and the
        # context manager closes it even if formatting or writing fails.
        with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
            out_f.write(format_anosim_results(anosim_results))
    elif method == 'best':
        best = Best(dm, md_map, categories)
        best_results = best()

        with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
            out_f.write(format_best_results(best_results))
    elif method == 'permanova':
        permanova = Permanova(md_map, dm, categories[0])
        permanova_results = permanova(num_perms)

        with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
            out_f.write(format_permanova_results(permanova_results))
    else:
        raise ValueError("Unrecognized method '%s'. Valid methods: %r"
                         % (method, methods))
示例#11
0
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Run one statistical test relating a distance matrix to metadata.

    Nothing is returned; every result is written to files inside out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the test to run. 'anosim', 'permanova' and 'bioenv'
            are computed in Python; 'adonis', 'morans_i', 'mrpp', 'permdisp'
            and 'dbrda' are delegated to the R script '<method>.r'. Any other
            value raises ValueError
        categories - list of metadata mapping file columns to test. Only
            'bioenv' uses every category; the other methods run on the first
            one (the R-backed methods still validate all of them)
        num_perms - number of permutations used to compute the p-value. Not
            used by 'bioenv' or 'morans_i'
        out_dir - directory that receives the results files. It is assumed
            to already exist and to be writable

    Raises TypeError if categories is not a list or if a Moran's I category is
    not numeric, and ValueError for every other validation failure.
    """
    # A bare string would otherwise be iterated character by character.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # SampleID is an identifier, not a category: every value is unique, so it
    # can neither form groups nor act as a numeric variable.
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BIO-ENV "
                         "analyses). Please choose a different metadata "
                         "column to perform statistical tests on.")

    dm = DistanceMatrix.read(dm_fp)

    # ---- Methods computed in Python ------------------------------------
    if method in ('anosim', 'permanova', 'bioenv'):
        with open(map_fp, 'U') as map_f:
            sample_metadata = parse_mapping_file_to_dict(map_f)[0]
        metadata_df = pd.DataFrame.from_dict(sample_metadata, orient='index')

        results_fp = join(out_dir, '%s_results.txt' % method)

        if method == 'bioenv':
            bioenv_results = bioenv(dm, metadata_df, columns=categories)
            bioenv_results.to_csv(results_fp, sep='\t')
            return

        # Only 'anosim' and 'permanova' can reach this point.
        test_cls = ANOSIM if method == 'anosim' else PERMANOVA
        test = test_cls(dm, metadata_df, column=categories[0])
        test_results = test(num_perms)

        with open(results_fp, 'w') as results_f:
            results_f.write(test_results.summary())
        return

    # ---- Methods delegated to R ----------------------------------------
    # Drop mapping file samples that are absent from the distance matrix so
    # the validation below only sees relevant samples. strict=True makes a
    # distance matrix sample that is missing from the mapping file an error.
    with open(map_fp, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)
    md_map.filterSamples(dm.ids, strict=True)

    if method not in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
        raise ValueError("Unrecognized method '%s'. Valid methods: %r"
                         % (method, methods))

    # All input validation has to happen here, before R is invoked. Every
    # category must exist and must define more than one group.
    for category in categories:
        if category not in md_map.CategoryNames:
            raise ValueError("Category '%s' not found in mapping file "
                             "columns." % category)

        if md_map.hasSingleCategoryValue(category):
            raise ValueError("All values in category '%s' are the "
                             "same. The statistical method '%s' "
                             "cannot operate on a category that "
                             "creates only a single group of samples "
                             "(e.g. there are no 'between' distances "
                             "because there is only a single group)."
                             % (category, method))

    # Command-line arguments shared by every R script.
    r_args = ('-d %s -m %s -c %s -o %s'
              % (dm_fp, map_fp, categories[0], out_dir))

    if method == 'morans_i':
        # Moran's I only works on numeric categories and takes no
        # permutation count.
        for category in categories:
            if not md_map.isNumericCategory(category):
                raise TypeError("The category '%s' is not numeric. "
                                "Not all values could be converted to "
                                "numbers." % category)
    else:
        # The grouping-based methods need at least one group holding more
        # than a single sample.
        for category in categories:
            if md_map.hasUniqueCategoryValues(category):
                raise ValueError("All values in category '%s' are "
                                 "unique. This statistical method "
                                 "cannot operate on a category with "
                                 "unique values (e.g. there are no "
                                 "'within' distances because each "
                                 "group of samples contains only a "
                                 "single sample)." % category)

        if num_perms < 0:
            raise ValueError("The number of permutations must be "
                             "greater than or equal to zero.")

        r_args += ' -n %d' % num_perms

    r_executor = RExecutor(TmpDir=get_qiime_temp_dir())
    r_executor([r_args], '%s.r' % method, output_dir=out_dir)
示例#12
0
def main():
    """Run the statistical method selected on the command line.

    Reads the options described by script_info, validates the inputs and then
    either runs the test in Python ('anosim', 'best', 'permanova') and writes
    a tab-separated '<method>_results.txt' file into the output directory, or
    hands the work to the R script '<method>.r' ('adonis', 'morans_i', 'mrpp',
    'permdisp', 'dbrda').

    All failures detected here are reported through option_parser.error().
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except Exception:
        # Deliberately broad (any failure becomes a usage error), but unlike a
        # bare except this lets KeyboardInterrupt/SystemExit propagate.
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the mapping file and distance matrix, closing each input file as
    # soon as it has been consumed.
    with open(opts.mapping_file, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)
    with open(opts.input_dm, 'U') as dm_f:
        dm = DistanceMatrix.parseDistanceMatrix(dm_f)

    # Separate all categories into a list; most methods use only the first.
    categories = opts.categories.split(',')

    # Cursory check to make sure all categories passed in are in mapping file.
    with open(opts.mapping_file, 'U') as map_f:
        maps = parse_mapping_file(map_f.readlines())
    # maps[1] is the header row; its first column is skipped.
    header_categories = maps[1][1:]
    for category in categories:
        if category not in header_categories:
            option_parser.error("Category '%s' not found in mapping file "
                                "columns:" % category)

    # Make sure the input distance matrix is symmetric and hollow. Must check
    # here before allowing R to use it, as R will silently ignore the diagonal
    # and upper triangle of the distance matrix.
    if not dm.is_symmetric_and_hollow():
        option_parser.error("The distance matrix must be symmetric and "
                            "hollow.")

    method = opts.method
    results_fp = path.join(opts.output_dir, '%s_results.txt' % method)

    # R-backed methods: the flag used to pass the distance matrix to the
    # script, and whether the script is given a permutation count.
    # NOTE(review): 'morans_i' and 'dbrda' receive the matrix via -i while the
    # others use -d; this mirrors the original calls - confirm against the R
    # scripts.
    r_methods = {
        'adonis': ('-d', True),
        'morans_i': ('-i', False),
        'mrpp': ('-d', True),
        'permdisp': ('-d', True),
        'dbrda': ('-i', True),
    }

    if method in r_methods:
        dm_flag, takes_num_perms = r_methods[method]
        r_args = (dm_flag + " " + opts.input_dm +
                  " -m " + opts.mapping_file +
                  " -c " + categories[0] +
                  " -o " + opts.output_dir)
        if takes_num_perms:
            r_args += " -n " + str(opts.num_permutations)

        rex = RExecutor()
        rex([r_args], '%s.r' % method, output_dir=opts.output_dir)
    elif method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(opts.num_permutations)

        with open(results_fp, 'w') as output_file:
            output_file.write("Method Name\tR-value\tP-value\n")
            output_file.write(anosim_results["method_name"] + "\t" +
                              str(anosim_results["r_value"]) + "\t" +
                              str(anosim_results["p_value"]) + "\t\n")
    elif method == 'best':
        bioenv = BioEnv(dm, md_map, categories)
        bioenv_results = bioenv()

        with open(results_fp, 'w') as output_file:
            output_file.write("Method Name:\tNum_Vars:\t\n")
            output_file.write(bioenv_results["method_name"] + "\t" +
                              str(bioenv_results["num_vars"]) + "\t\n")
            # Each list is written on one line, every entry followed by a tab.
            output_file.write("Variables:\t\n")
            output_file.write(''.join(str(variable) + "\t"
                                      for variable in bioenv_results["vars"]))
            output_file.write("\n")
            output_file.write("RHO_Values:\t\n")
            output_file.write(''.join(
                str(rho_val) + "\t"
                for rho_val in bioenv_results["bioenv_rho_vals"]))
            output_file.write("\n")
    elif method == 'permanova':
        permanova_plain = Permanova(md_map, dm, categories[0])
        permanova_results = permanova_plain(opts.num_permutations)

        with open(results_fp, 'w') as output_file:
            output_file.write("Method Name\tF-value\tP-value\n")
            output_file.write(
                permanova_results["method_name"] + "\t" +
                str(permanova_results["f_value"]) + "\t" +
                format_p_value_for_num_iters(permanova_results["p_value"],
                                             opts.num_permutations) + "\t\n")
    else:
        # Previously an unknown method fell through and the script exited
        # successfully without producing any output.
        option_parser.error("Unrecognized method '%s'." % method)
示例#13
0
文件: filter.py 项目: qinjunjie/qiime
def sample_ids_from_category_state_coverage(
    mapping_f,
    coverage_category,
    subject_category,
    min_num_states=None,
    required_states=None,
    considered_states=None,
):
    """Select the samples of subjects that sufficiently cover a category.

    Samples are grouped by their subject_category value (one group per
    subject). A subject "covers" a coverage_category state when at least one
    of its samples has that state. A subject is kept only if it satisfies
    every filter that was supplied:

        * min_num_states: the subject covers at least this many states
          (counting only considered_states, when given)
        * required_states: the subject covers every one of these states

    At least one of the two filters must be supplied; supplying both ANDs
    them. Typical use: coverage_category is a timepoint column and
    subject_category an individual column, keeping only individuals sampled
    at enough timepoints and/or at particular timepoints (e.g. first and
    last).

    Returns a 3-tuple: the list of sample IDs to keep, the number of subjects
    kept, and the number of distinct coverage_category states among the kept
    subjects. The sample ID order is not guaranteed to follow the mapping
    file.

    Arguments:
        mapping_f - metadata mapping file (file-like object)
        coverage_category - category to test subjects' coverage (string)
        subject_category - category to group samples by subject (string)
        min_num_states - minimum number of coverage_category states a subject
            must cover to be kept (integer)
        required_states - coverage_category states a subject must cover to be
            kept (list of strings); each must occur in the mapping file
        considered_states - states that count toward min_num_states (list of
            strings); states absent from the mapping file are allowed

    Raises ValueError when a category is 'SampleID' or missing from the
    mapping file, when a required state is unknown, or when neither filter is
    supplied.
    """
    md_map = MetadataMap.parseMetadataMap(mapping_f)

    # Sanity-check the inputs.
    if "SampleID" in (coverage_category, subject_category):
        raise ValueError(
            "The 'SampleID' category is not suitable for use in "
            "this function. Please choose a different category "
            "from the metadata mapping file."
        )

    known_categories = md_map.CategoryNames
    if coverage_category not in known_categories:
        raise ValueError("The coverage category '%s' is not in the metadata " "mapping file." % coverage_category)
    if subject_category not in md_map.CategoryNames:
        raise ValueError("The subject category '%s' is not in the metadata " "mapping file." % subject_category)

    if required_states is not None:
        # Every required state has to be an actual coverage_category state.
        required_states = set(str(state) for state in required_states)
        all_coverage_states = set(md_map.getCategoryValues(md_map.SampleIds, coverage_category))
        unknown_states = required_states - all_coverage_states

        if unknown_states:
            raise ValueError(
                "The category state(s) '%s' are not in the '%s' "
                "category in the metadata mapping file." % (", ".join(unknown_states), coverage_category)
            )

    if considered_states is not None:
        # Looser than required_states: entries that never occur in the mapping
        # file are fine, so callers may pass e.g. range(100) to restrict the
        # count to a range of states.
        considered_states = set(str(state) for state in considered_states)

    if min_num_states is None and required_states is None:
        raise ValueError(
            "You must specify either the minimum number of "
            "category states the subject must have samples for "
            "(min_num_states), or the minimal category states "
            "the subject must have samples for (required_states), "
            "or both. Supplying neither filtering criteria is "
            "not supported."
        )

    # Group sample IDs by subject.
    samp_ids_by_subject = defaultdict(list)
    for samp_id in md_map.SampleIds:
        samp_ids_by_subject[md_map.getCategoryValue(samp_id, subject_category)].append(samp_id)

    # Apply the filters subject by subject.
    kept_samp_ids = []
    kept_states = set()
    kept_subject_count = 0
    for subject_samp_ids in samp_ids_by_subject.values():
        covered_states = set(md_map.getCategoryValues(subject_samp_ids, coverage_category))

        if min_num_states is not None:
            # Without considered_states every covered state counts.
            if considered_states is None:
                num_counted_states = len(covered_states)
            else:
                num_counted_states = len(covered_states & considered_states)
            if num_counted_states < min_num_states:
                continue

        if required_states is not None and not required_states.issubset(covered_states):
            continue

        kept_samp_ids.extend(subject_samp_ids)
        kept_states.update(covered_states)
        kept_subject_count += 1

    return kept_samp_ids, kept_subject_count, len(kept_states)
示例#14
0
    def setUp(self):
        """Define some sample data that will be used by the tests.

        Creates temporary input/output directories under the QIIME temp dir,
        parses the in-memory mapping fixtures, and writes the mapping,
        rarefaction, coordinate, OTU table and prefs fixtures to files in the
        input directory. Everything created is recorded in
        self.dirs_to_remove / self.files_to_remove.
        """
        # The prefix to use for temporary files. This prefix may be added to,
        # but all temp dirs and files created by the tests will have this
        # prefix at a minimum.
        self.prefix = 'my_microbes_tests_'

        self.start_dir = getcwd()
        self.dirs_to_remove = []
        self.files_to_remove = []

        self.tmp_dir = get_qiime_temp_dir()
        if not exists(self.tmp_dir):
            makedirs(self.tmp_dir)
            # If test creates the temp dir, also remove it.
            self.dirs_to_remove.append(self.tmp_dir)

        # Set up temporary input and output directories.
        self.input_dir = mkdtemp(dir=self.tmp_dir,
                                 prefix='%sinput_dir_' % self.prefix)
        self.dirs_to_remove.append(self.input_dir)

        self.output_dir = mkdtemp(dir=self.tmp_dir,
                                  prefix='%soutput_dir_' % self.prefix)
        self.dirs_to_remove.append(self.output_dir)

        # Data that will be used by the tests.
        self.metadata_map = MetadataMap.parseMetadataMap(
                mapping_str.split('\n'))
        self.mapping_data, self.mapping_header = parse_mapping_file(
                mapping_str.split('\n'))[:2]

        # Fixture files are written inside 'with' blocks so each handle is
        # closed even if the write itself fails.
        self.mapping_fp = join(self.input_dir, 'map.txt')
        with open(self.mapping_fp, 'w') as mapping_f:
            mapping_f.write(mapping_str)
        self.files_to_remove.append(self.mapping_fp)

        self.personal_metadata_map = MetadataMap.parseMetadataMap(
                personal_mapping_str.split('\n'))
        self.personal_mapping_data = parse_mapping_file(
                personal_mapping_str.split('\n'))[0]

        self.rarefaction_lines = collated_alpha_div_str.split('\n')
        self.na_rarefaction_lines = collated_alpha_div_na_str.split('\n')

        self.rarefaction_dir = join(self.input_dir, 'collated_adiv')
        create_dir(self.rarefaction_dir)
        self.rarefaction_fp = join(self.rarefaction_dir, 'PD_whole_tree.txt')
        with open(self.rarefaction_fp, 'w') as rarefaction_f:
            rarefaction_f.write(collated_alpha_div_str)
        self.files_to_remove.append(self.rarefaction_fp)

        self.coord_fp = join(self.input_dir, 'coord.txt')
        with open(self.coord_fp, 'w') as coord_f:
            coord_f.write(coord_str)
        self.files_to_remove.append(self.coord_fp)

        self.otu_table_fp = join(self.input_dir, 'otu_table.biom')
        with open(self.otu_table_fp, 'w') as otu_table_f:
            otu_table_f.write(otu_table_str)
        self.files_to_remove.append(self.otu_table_fp)

        self.prefs_fp = join(self.input_dir, 'prefs.txt')
        with open(self.prefs_fp, 'w') as prefs_f:
            prefs_f.write(prefs_str)
        self.files_to_remove.append(self.prefs_fp)

        # NOTE(review): the '[email protected]' fragments below look like
        # redacted e-mail addresses rather than the intended fixture values -
        # confirm against the upstream test data.
        self.recipients = ["# a comment", " ", " foo1\[email protected]  ",
                            "foo2\t [email protected],  [email protected],[email protected] "]

        self.email_settings = ["# A comment", "# Another comment",
                "smtp_server\tsome.smtp.server", "smtp_port\t42",
                "sender\[email protected]", "password\t424242!"]
示例#15
0
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Run one statistical test relating a distance matrix to metadata.

    Nothing is returned; every result is written to files inside out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the test to run. 'anosim', 'permanova' and 'bioenv'
            are computed in Python and written to '<method>_results.txt';
            'adonis', 'morans_i', 'mrpp', 'permdisp' and 'dbrda' are delegated
            to the R script '<method>.r'. Any other value raises ValueError
        categories - list of metadata mapping file columns to test. Only
            'bioenv' uses every category; the other methods run on the first
            one (the R-backed methods still validate all of them)
        num_perms - number of permutations used to compute the p-value. Not
            used by 'bioenv' or 'morans_i'
        out_dir - directory that receives the results files. It is assumed
            to already exist and to be writable

    Raises TypeError if categories is not a list or if a Moran's I category is
    not numeric, and ValueError for every other validation failure.
    """
    # A bare string would otherwise be iterated character by character.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # SampleID is an identifier, not a category: every value is unique, so it
    # can neither form groups nor act as a numeric variable.
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BIO-ENV "
                         "analyses). Please choose a different metadata "
                         "column to perform statistical tests on.")

    dm = DistanceMatrix.read(dm_fp)

    # ---- Methods computed in Python ------------------------------------
    if method in ('anosim', 'permanova', 'bioenv'):
        with open(map_fp, 'U') as map_f:
            sample_metadata = parse_mapping_file_to_dict(map_f)[0]
        metadata_df = pd.DataFrame.from_dict(sample_metadata, orient='index')

        results_fp = join(out_dir, '%s_results.txt' % method)

        if method == 'bioenv':
            results = bioenv(dm, metadata_df, columns=categories)
        else:
            # Only 'anosim' and 'permanova' can reach this branch.
            stat_test = anosim if method == 'anosim' else permanova
            results = stat_test(dm,
                                metadata_df,
                                column=categories[0],
                                permutations=num_perms)

        results.to_csv(results_fp, sep='\t')
        return

    # ---- Methods delegated to R ----------------------------------------
    # Drop mapping file samples that are absent from the distance matrix so
    # the validation below only sees relevant samples. strict=True makes a
    # distance matrix sample that is missing from the mapping file an error.
    with open(map_fp, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)
    md_map.filterSamples(dm.ids, strict=True)

    if method not in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
        raise ValueError("Unrecognized method '%s'. Valid methods: %r" %
                         (method, methods))

    # All input validation has to happen here, before R is invoked. Every
    # category must exist and must define more than one group.
    for category in categories:
        if category not in md_map.CategoryNames:
            raise ValueError("Category '%s' not found in mapping file "
                             "columns." % category)

        if md_map.hasSingleCategoryValue(category):
            raise ValueError("All values in category '%s' are the "
                             "same. The statistical method '%s' "
                             "cannot operate on a category that "
                             "creates only a single group of samples "
                             "(e.g. there are no 'between' distances "
                             "because there is only a single group)." %
                             (category, method))

    # Command-line arguments shared by every R script.
    r_args = ('-d %s -m %s -c %s -o %s' %
              (dm_fp, map_fp, categories[0], out_dir))

    if method == 'morans_i':
        # Moran's I only works on numeric categories and takes no
        # permutation count.
        for category in categories:
            if not md_map.isNumericCategory(category):
                raise TypeError("The category '%s' is not numeric. "
                                "Not all values could be converted to "
                                "numbers." % category)
    else:
        # The grouping-based methods need at least one group holding more
        # than a single sample. The one exception is adonis given a numeric
        # category, where all-unique values are accepted.
        for category in categories:
            if (md_map.hasUniqueCategoryValues(category)
                    and (method != 'adonis'
                         or not md_map.isNumericCategory(category))):
                raise ValueError("All values in category '%s' are "
                                 "unique. This statistical method "
                                 "cannot operate on a category with "
                                 "unique values (e.g. there are no "
                                 "'within' distances because each "
                                 "group of samples contains only a "
                                 "single sample)." % category)

        if num_perms < 0:
            raise ValueError("The number of permutations must be "
                             "greater than or equal to zero.")

        r_args += ' -n %d' % num_perms

    r_executor = RExecutor(TmpDir=get_qiime_temp_dir())
    r_executor([r_args], '%s.r' % method)
示例#16
0
def sample_ids_from_category_state_coverage(mapping_f,
                                            coverage_category,
                                            subject_category,
                                            min_num_states=None,
                                            required_states=None,
                                            considered_states=None,
                                            splitter_category=None):
    """Select the samples of subjects that sufficiently cover a category.

    Samples are grouped by their subject_category value (one group per
    subject). A subject "covers" a coverage_category state when at least one
    of its samples has that state. Two filters are available, min_num_states
    and required_states; at least one must be supplied, and supplying both
    ANDs them. Typical use: coverage_category is a timepoint column and
    subject_category an individual column, keeping only individuals sampled
    at enough timepoints and/or at particular timepoints (e.g. first and
    last).

    This function validates its arguments and delegates the actual filtering
    to _filter_sample_ids_from_category_state_coverage. Without
    splitter_category that helper's result is returned directly; per the
    original documentation it is a set of sample IDs to keep, the number of
    subjects kept, and a set of the coverage_category states kept, with no
    guaranteed ordering relative to the mapping file. With splitter_category
    a dictionary mapping each splitter_category state to such a result is
    returned instead.

    Arguments:
        mapping_f - metadata mapping file (file-like object)
        coverage_category - category to test subjects' coverage (string)
        subject_category - category to group samples by subject (string)
        min_num_states - minimum number of coverage_category states a subject
            must cover to be kept (integer)
        required_states - coverage_category states a subject must cover to be
            kept (list of strings or items convertible to strings); each must
            occur in the mapping file
        considered_states - states that count toward min_num_states (list of
            strings or items convertible to strings); states absent from the
            mapping file are allowed
        splitter_category - category used to partition the samples before
            filtering. Each partition is filtered independently with the same
            criteria (e.g. an individual may pass for tongue samples while
            failing for palm samples, and the tongue samples are still kept
            in the tongue results)

    Raises ValueError when a category is 'SampleID', missing from the mapping
    file or repeated, when a required state is unknown, or when neither
    filter is supplied.
    """
    metadata_map = MetadataMap.parseMetadataMap(mapping_f)

    # Sanity-check the inputs.
    checked_categories = [coverage_category, subject_category]
    if splitter_category is not None:
        checked_categories.append(splitter_category)

    if 'SampleID' in checked_categories:
        raise ValueError("The 'SampleID' category is not suitable for use in "
                         "this function. Please choose a different category "
                         "from the metadata mapping file.")

    for checked_category in checked_categories:
        if checked_category not in metadata_map.CategoryNames:
            raise ValueError("The category '%s' is not in the metadata "
                             "mapping file." % checked_category)

    # A set can only be shorter than the list it was built from when the
    # list holds duplicates.
    if len(set(checked_categories)) != len(checked_categories):
        raise ValueError("The coverage, subject, and (optional) splitter "
                         "categories must all be unique.")

    if required_states is not None:
        # Every required state has to be an actual coverage_category state.
        required_states = set(str(state) for state in required_states)
        all_coverage_states = set(
            metadata_map.getCategoryValues(metadata_map.SampleIds,
                                           coverage_category))
        unknown_states = required_states - all_coverage_states

        if unknown_states:
            raise ValueError(
                "The category state(s) '%s' are not in the '%s' "
                "category in the metadata mapping file." %
                (', '.join(unknown_states), coverage_category))

    if considered_states is None:
        # Default: every state counts toward min_num_states.
        def consider_state(state):
            return True
    else:
        # Looser than required_states: entries that never occur in the mapping
        # file are fine, so callers may pass e.g. range(100) to restrict the
        # count to a range of states.
        considered_states = set(str(state) for state in considered_states)

        def consider_state(state):
            return state in considered_states

    if min_num_states is None and required_states is None:
        raise ValueError(
            "You must specify either the minimum number of "
            "category states the subject must have samples for "
            "(min_num_states), or the minimal category states "
            "the subject must have samples for (required_states), "
            "or both. Supplying neither filtering criteria is "
            "not supported.")

    if splitter_category is None:
        return _filter_sample_ids_from_category_state_coverage(
            metadata_map, metadata_map.SampleIds, coverage_category,
            subject_category, consider_state, min_num_states, required_states)

    # "Split" the mapping file: bucket the sample IDs by their
    # splitter_category state, then filter each bucket on its own.
    samp_ids_by_split_state = defaultdict(list)
    for samp_id in metadata_map.SampleIds:
        split_state = metadata_map.getCategoryValue(samp_id,
                                                    splitter_category)
        samp_ids_by_split_state[split_state].append(samp_id)

    return dict(
        (split_state,
         _filter_sample_ids_from_category_state_coverage(
             metadata_map, split_samp_ids, coverage_category,
             subject_category, consider_state, min_num_states,
             required_states))
        for split_state, split_samp_ids in samp_ids_by_split_state.items())
示例#17
0
def _color_field_states(map_f, samp_ids, field, field_states, color_by_field):
    """Colors one field by another.

    Returns a list of matplotlib-compatible colors, one for each of the input
    field_states (in the same order as field_states). Also returns a
    dictionary mapping color_by_field states to colors (useful for building a
    legend, for example).

    Colors are handed out in data_color_order order, one per distinct
    color_by_field state, the first time that state is encountered while
    walking field_states. If there are not enough colors available (they are
    drawn from qiime.colors.data_colors), an error will be raised as the
    color mapping (and legend) would be ambiguous.

    Every field_state must correspond to exactly one color_by_field state
    among the considered samples (otherwise it is unclear which color_by_field
    state should be used to color it by). Several field states may share the
    same color_by_field state, in which case they receive the same color.

    Arguments:
        map_f - the mapping file (file-like object)
        samp_ids - a list of sample IDs to consider in the mapping file. Only
            these sample IDs will be used when coloring field states
        field - the field in the mapping file to color
        field_states - the field states in field to color
        color_by_field - the field in the mapping file to color field_states by

    Raises:
        ValueError - if field or color_by_field is not a column in the mapping
            file, if any of field_states does not occur in field for the
            given samp_ids, if a field state corresponds to more than one
            color_by_field state, or if the color pool is exhausted
    """
    colors = []
    color_pool = [matplotlib_rgb_color(data_colors[color].toRGB())
                  for color in data_color_order]
    metadata_map = MetadataMap.parseMetadataMap(map_f)

    for field_to_check in field, color_by_field:
        if field_to_check not in metadata_map.CategoryNames:
            raise ValueError("The field '%s' is not in the metadata mapping "
                             "file's column headers." % field_to_check)

    all_field_states = metadata_map.getCategoryValues(samp_ids, field)
    all_color_by_states = metadata_map.getCategoryValues(samp_ids,
                                                         color_by_field)

    # Build the set of requested states once; it serves both the validation
    # below and the per-sample membership test in the mapping loop.
    requested_states = set(field_states)
    if requested_states - set(all_field_states):
        raise ValueError("Encountered unrecognizable field state(s) in %r "
                         "for field '%s'." % (field_states, field))

    # Build mapping from one field to the other: for every requested field
    # state, collect the distinct "color by" states seen alongside it.
    field_mapping = defaultdict(set)
    for field_state, color_by_state in zip(all_field_states,
                                           all_color_by_states):
        if field_state in requested_states:
            field_mapping[field_state].add(color_by_state)

    # For each of the specified input field states, find its corresponding
    # "color by" field state and give it a color if it hasn't been assigned one
    # yet. Make sure we have enough colors and the mapping is unambiguous.
    color_mapping = {}
    for field_state in field_states:
        color_by_states = field_mapping[field_state]

        if len(color_by_states) != 1:
            raise ValueError(
                "The field '%s' to color by does not have a "
                "one-to-one mapping with field '%s'. Coloring "
                "would be ambiguous." % (color_by_field, field)
            )

        # Exactly one element is present, so this picks it deterministically.
        color_by_state = next(iter(color_by_states))
        if color_by_state not in color_mapping:
            if not color_pool:
                raise ValueError(
                    "There are not enough available QIIME colors "
                    "to color each of the field states in field "
                    "'%s'. Coloring would be ambiguous." % color_by_field
                )
            # Take colors from the front so they follow data_color_order.
            color_mapping[color_by_state] = color_pool.pop(0)

        colors.append(color_mapping[color_by_state])

    return colors, color_mapping