def guide_tree_from_query_sequences(query_sequences,
                                    distance_fn=three_mer_distance,
                                    display_tree=False):
    """Build a guide tree from all-against-all distances between sequences.

    Arguments:
        query_sequences - iterable of (sequence id, sequence) pairs. It is
            materialized into a list first, so one-shot iterables are
            supported as well
        distance_fn - callable taking two sequences and returning the
            distance between them; it is called once for every ordered pair
            of sequences (including each sequence against itself)
        display_tree - if True, additionally draw a dendrogram of the
            clustering (black links, labelled with the sequence ids)

    Returns the result of calling to_tree on the linkage computed by
    average from the condensed form of the distance matrix.
    """
    # The sequences are traversed several times below, so take a snapshot.
    records = list(query_sequences)
    seq_ids = [seq_id for seq_id, _ in records]

    # Full square matrix of distances, one row per sequence, computed with
    # the caller-supplied distance function.
    rows = [[distance_fn(seq1, seq2) for _, seq2 in records]
            for _, seq1 in records]

    guide_dm = SymmetricDistanceMatrix(rows, seq_ids)
    # NOTE(review): average/to_tree/dendrogram are presumably the
    # scipy.cluster.hierarchy functions -- confirm against the file's imports.
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
                   link_color_func=lambda x: 'black')
    return guide_tree
def run_mantel_test(method, fps, distmats, num_perms, tail_type, comment,
                    control_dm_fp=None, control_dm=None,
                    sample_id_map=None):
    """Runs a Mantel test on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test.

    Every pair is compared starting from the two original matrices: the
    sample filtering done for one pair is never carried over to the next pair.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        method - which Mantel test to run (either 'mantel' or 'partial_mantel')
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        tail_type - the type of tail test to use when calculating the
            p-value(s). Can be 'two sided', 'greater', or 'less'. Only applies
            when method is mantel
        comment - comment string to add to the beginning of the results string
            (None is treated as an empty string)
        control_dm_fp - filepath of the control distance matrix. Only applies
            when method is partial_mantel (it is required then)
        control_dm - tuple containing control distance matrix labels and matrix
            data. Only applies when method is partial_mantel (it is required
            then)
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)

    Raises:
        ValueError - if fps and distmats differ in length, if method is not
            'mantel' or 'partial_mantel', or if method is 'partial_mantel' and
            the control matrix or its filepath is missing
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''

    if method == 'mantel':
        header = ('DM1\tDM2\tNumber of entries\tMantel r statistic\t'
                  'p-value\tNumber of permutations\tTail type\n')
    elif method == 'partial_mantel':
        if not control_dm_fp or not control_dm:
            raise ValueError("You must provide a control matrix filepath and "
                             "control matrix when running the partial Mantel "
                             "test.")
        header = ('DM1\tDM2\tCDM\tNumber of entries\t'
                  'Mantel r statistic\tp-value\tNumber of permutations\t'
                  'Tail type\n')
    else:
        raise ValueError("Invalid method '%s'. Must be either 'mantel' or "
                         "'partial_mantel'." % method)

    # Output is collected piecewise and joined once at the end.
    lines = [comment, header]

    # zip() is an iterator on Python 3 and cannot be sliced, so materialize
    # the (filepath, matrix) pairs once up front.
    named_dms = list(zip(fps, distmats))

    # Loop over all pairs of dms.
    for i, (fp1, (dm1_all_labels, dm1_all_data)) in enumerate(named_dms):
        for fp2, (dm2_all_labels, dm2_all_data) in named_dms[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs. The filtered results get their own names
            # so that the next pair starts again from the unfiltered dm1.
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices(
                    (dm1_all_labels, dm1_all_data),
                    (dm2_all_labels, dm2_all_data),
                    lookup=sample_id_map)
            if method == 'partial_mantel':
                # We need to intersect three sets (three matrices).
                (dm1_labels, dm1_data), (cdm_labels, cdm_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), control_dm,
                        lookup=sample_id_map)
                (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), (dm2_labels, dm2_data),
                        lookup=sample_id_map)
                if len(dm1_labels) < 3:
                    lines.append('%s\t%s\t%s\t%d\tToo few samples\n'
                                 % (fp1, fp2, control_dm_fp,
                                    len(dm1_labels)))
                    continue
            elif len(dm1_labels) < 3:
                lines.append('%s\t%s\t%d\tToo few samples\n'
                             % (fp1, fp2, len(dm1_labels)))
                continue

            dm1 = SymmetricDistanceMatrix(dm1_data, dm1_labels)
            dm2 = SymmetricDistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our correlation test and run it with
            # the specified number of permutations.
            if method == 'mantel':
                results = Mantel(dm1, dm2, tail_type)(num_perms)
                p_str = format_p_value_for_num_iters(results['p_value'],
                                                     num_perms)
                lines.append("%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n"
                             % (fp1, fp2, len(dm1_labels),
                                results['r_value'], p_str, num_perms,
                                tail_type))
            elif method == 'partial_mantel':
                cdm = SymmetricDistanceMatrix(cdm_data, cdm_labels)
                results = PartialMantel(dm1, dm2, cdm)(num_perms)
                p_str = format_p_value_for_num_iters(results['mantel_p'],
                                                     num_perms)
                # The tail type column is always reported as 'greater' for
                # the partial test; tail_type is not passed to PartialMantel.
                lines.append("%s\t%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n"
                             % (fp1, fp2, control_dm_fp, len(dm1_labels),
                                results['mantel_r'], p_str, num_perms,
                                'greater'))
    return ''.join(lines)
# Example #3
# 0
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run. 'adonis', 'morans_i',
            'mrpp', 'permdisp' and 'dbrda' are dispatched to the R script
            '<method>.r'; 'anosim', 'best' and 'permanova' use the Python
            implementations and write '<method>_results.txt' in out_dir
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'best', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'best' or 'morans_i', this parameter will be
            ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it

    Raises:
        TypeError - if categories is not a list, or if method is 'morans_i'
            and one of the categories is not numeric
        ValueError - if 'SampleID' is one of the categories, if a category
            fails the checks done for the R-based methods (missing from the
            mapping file, a single value, or all-unique values), if num_perms
            is negative for a permutation-based R method, or if method is not
            recognized
    """

    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, list):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BEST analyses). "
                         "Please use a different metadata column to perform "
                         "statistical tests on.")

    # Parse the mapping file and distance matrix.
    with open(map_fp, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)

    with open(dm_fp, 'U') as dm_f:
        dm = SymmetricDistanceMatrix.from_file(dm_f)

    # Remove any samples from the mapping file that aren't in the distance
    # matrix (important for validation checks). Use strict=True so that an
    # error is raised if the distance matrix contains any samples that aren't
    # in the mapping file.
    md_map.filterSamples(dm.ids, strict=True)

    # Run the specified statistical method.
    if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
        # These methods are run in R. Input validation must be done here before
        # running the R commands. The pure-Python implementations perform all
        # validation in the classes in the stats module.

        # Check to make sure all categories passed in are in mapping file and
        # are not all the same value.
        for category in categories:
            if category not in md_map.CategoryNames:
                raise ValueError("Category '%s' not found in mapping file "
                                 "columns." % category)

            if md_map.hasSingleCategoryValue(category):
                raise ValueError("All values in category '%s' are the "
                                 "same. The statistical method '%s' cannot "
                                 "operate on a category that creates only "
                                 "a single group of samples (e.g. there "
                                 "are no 'between' distances because "
                                 "there is only a single group)."
                                 % (category, method))

        # Build the command arguments string. Only the first category is
        # handed to the R script.
        r_args = '-d %s -m %s -c %s -o %s' % (dm_fp, map_fp, categories[0],
                                              out_dir)

        if method == 'morans_i':
            # Moran's I requires only numeric categories.
            for category in categories:
                if not md_map.isNumericCategory(category):
                    raise TypeError("The category '%s' is not numeric. Not "
                                    "all values could be converted to numbers."
                                    % category)
        else:
            # The rest require groups of samples, so the category values cannot
            # all be unique.
            for category in categories:
                if md_map.hasUniqueCategoryValues(category):
                    raise ValueError("All values in category '%s' are unique. "
                                     "This statistical method cannot operate "
                                     "on a category with unique values (e.g. "
                                     "there are no 'within' distances because "
                                     "each group of samples contains only a "
                                     "single sample)." % category)

            # Only Moran's I doesn't accept a number of permutations.
            if num_perms < 0:
                raise ValueError("The number of permutations must be greater "
                                 "than or equal to zero.")

            r_args += ' -n %d' % num_perms

        rex = RExecutor(TmpDir=get_qiime_temp_dir())
        rex([r_args], '%s.r' % method, output_dir=out_dir)
    elif method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(num_perms)

        # The context manager closes the file even if formatting or writing
        # raises.
        with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
            out_f.write(format_anosim_results(anosim_results))
    elif method == 'best':
        best = Best(dm, md_map, categories)
        best_results = best()

        with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
            out_f.write(format_best_results(best_results))
    elif method == 'permanova':
        permanova = Permanova(md_map, dm, categories[0])
        permanova_results = permanova(num_perms)

        with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
            out_f.write(format_permanova_results(permanova_results))
    else:
        # NOTE(review): `methods` is not defined in this function; it is
        # presumably a module-level list of valid method names -- confirm.
        raise ValueError("Unrecognized method '%s'. Valid methods: %r"
                         % (method, methods))
def run_mantel_correlogram(fps, distmats, num_perms, comment, alpha,
                           sample_id_map=None,
                           variable_size_distance_classes=False):
    """Runs a Mantel correlogram analysis on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test, a list of correlogram filepath names, and a list of matplotlib
    Figure objects representing each correlogram.

    The correlogram filepaths can have an extension string appended to the end
    of them and then be used to save each of the correlogram Figures to a file.
    Each correlogram filepath will be a combination of the two distance matrix
    filepaths that were used to create it.

    Every pair is compared starting from the two original matrices: the
    sample filtering done for one pair is never carried over to the next pair.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        comment - comment string to add to the beginning of the results string
            (None is treated as an empty string)
        alpha - the alpha value to use to determine significance in the
            correlogram plots
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)
        variable_size_distance_classes - create distance classes that vary in
            size (i.e. width) but have the same number of distances in each
            class

    Raises:
        ValueError - if fps and distmats differ in length
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''

    # Output is collected piecewise and joined once at the end.
    lines = [comment,
             'DM1\tDM2\tNumber of entries\t'
             'Number of permutations\tClass index\t'
             'Number of distances\tMantel r statistic\t'
             'p-value\tp-value (Bonferroni corrected)\tTail type\n']
    correlogram_fps = []
    correlograms = []

    # zip() is an iterator on Python 3 and cannot be sliced, so materialize
    # the (filepath, matrix) pairs once up front.
    named_dms = list(zip(fps, distmats))

    # Loop over all pairs of dms.
    for i, (fp1, (dm1_all_labels, dm1_all_data)) in enumerate(named_dms):
        for fp2, (dm2_all_labels, dm2_all_data) in named_dms[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs. The filtered results get their own names
            # so that the next pair starts again from the unfiltered dm1.
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices(
                    (dm1_all_labels, dm1_all_data),
                    (dm2_all_labels, dm2_all_data),
                    lookup=sample_id_map)
            if len(dm1_labels) < 3:
                lines.append('%s\t%s\t%d\tToo few samples\n'
                             % (fp1, fp2, len(dm1_labels)))
                continue

            dm1 = SymmetricDistanceMatrix(dm1_data, dm1_labels)
            dm2 = SymmetricDistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our Mantel correlogram test and run it with
            # the specified number of permutations.
            mc = MantelCorrelogram(
                dm1, dm2, alpha=alpha,
                variable_size_distance_classes=variable_size_distance_classes)
            results = mc(num_perms)

            # Generate a name for the current correlogram and save it and the
            # correlogram itself. The name ends in '.' so that callers only
            # need to append an extension.
            dm1_name = path.basename(fp1)
            dm2_name = path.basename(fp2)
            correlogram_fps.append('_'.join((dm1_name, 'AND', dm2_name,
                                             'mantel_correlogram')) + '.')
            correlograms.append(results['correlogram_plot'])

            # Iterate over the results and write them to the text file. Only
            # the first row of a pair carries the pair-level columns; the
            # following rows leave them blank.
            first_time = True
            for class_idx, num_dist, r, p, p_corr in zip(
                    results['class_index'], results['num_dist'],
                    results['mantel_r'], results['mantel_p'],
                    results['mantel_p_corr']):
                # Format p-values and figure out which tail type we have based
                # on the sign of r.
                p_str = None
                if p is not None:
                    p_str = format_p_value_for_num_iters(p, num_perms)
                p_corr_str = None
                if p_corr is not None:
                    p_corr_str = format_p_value_for_num_iters(p_corr,
                                                              num_perms)
                if r is None:
                    tail_type = None
                elif r < 0:
                    tail_type = 'less'
                else:
                    tail_type = 'greater'

                if first_time:
                    lines.append(
                        '%s\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                            fp1, fp2, len(dm1_labels), num_perms, class_idx,
                            num_dist, r, p_str, p_corr_str, tail_type))
                    first_time = False
                else:
                    lines.append(
                        '\t\t\t\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                            class_idx, num_dist, r, p_str, p_corr_str,
                            tail_type))
    return ''.join(lines), correlogram_fps, correlograms
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run. 'adonis', 'morans_i',
            'mrpp', 'permdisp' and 'dbrda' are dispatched to the R script
            '<method>.r'; 'anosim', 'best' and 'permanova' use the Python
            implementations and write '<method>_results.txt' in out_dir
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'best', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'best' or 'morans_i', this parameter will be
            ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it

    Raises:
        TypeError - if categories is not a list, or if method is 'morans_i'
            and one of the categories is not numeric
        ValueError - if 'SampleID' is one of the categories, if a category
            fails the checks done for the R-based methods (missing from the
            mapping file, a single value, or all-unique values), if num_perms
            is negative for a permutation-based R method, or if method is not
            recognized
    """

    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, list):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BEST analyses). "
                         "Please use a different metadata column to perform "
                         "statistical tests on.")

    # Parse the mapping file and distance matrix.
    with open(map_fp, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)

    with open(dm_fp, 'U') as dm_f:
        dm = SymmetricDistanceMatrix.from_file(dm_f)

    # Remove any samples from the mapping file that aren't in the distance
    # matrix (important for validation checks). Use strict=True so that an
    # error is raised if the distance matrix contains any samples that aren't
    # in the mapping file.
    md_map.filterSamples(dm.ids, strict=True)

    # Run the specified statistical method.
    if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
        # These methods are run in R. Input validation must be done here before
        # running the R commands. The pure-Python implementations perform all
        # validation in the classes in the stats module.

        # Check to make sure all categories passed in are in mapping file and
        # are not all the same value.
        for category in categories:
            if category not in md_map.CategoryNames:
                raise ValueError("Category '%s' not found in mapping file "
                                 "columns." % category)

            if md_map.hasSingleCategoryValue(category):
                raise ValueError("All values in category '%s' are the "
                                 "same. The statistical method '%s' cannot "
                                 "operate on a category that creates only "
                                 "a single group of samples (e.g. there "
                                 "are no 'between' distances because "
                                 "there is only a single group)."
                                 % (category, method))

        # Build the command arguments string. Only the first category is
        # handed to the R script.
        r_args = '-d %s -m %s -c %s -o %s' % (dm_fp, map_fp, categories[0],
                                              out_dir)

        if method == 'morans_i':
            # Moran's I requires only numeric categories.
            for category in categories:
                if not md_map.isNumericCategory(category):
                    raise TypeError("The category '%s' is not numeric. Not "
                                    "all values could be converted to numbers."
                                    % category)
        else:
            # The rest require groups of samples, so the category values cannot
            # all be unique.
            for category in categories:
                if md_map.hasUniqueCategoryValues(category):
                    raise ValueError("All values in category '%s' are unique. "
                                     "This statistical method cannot operate "
                                     "on a category with unique values (e.g. "
                                     "there are no 'within' distances because "
                                     "each group of samples contains only a "
                                     "single sample)." % category)

            # Only Moran's I doesn't accept a number of permutations.
            if num_perms < 0:
                raise ValueError("The number of permutations must be greater "
                                 "than or equal to zero.")

            r_args += ' -n %d' % num_perms

        rex = RExecutor(TmpDir=get_qiime_temp_dir())
        rex([r_args], '%s.r' % method, output_dir=out_dir)
    elif method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(num_perms)

        # The context manager closes the file even if formatting or writing
        # raises.
        with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
            out_f.write(format_anosim_results(anosim_results))
    elif method == 'best':
        best = Best(dm, md_map, categories)
        best_results = best()

        with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
            out_f.write(format_best_results(best_results))
    elif method == 'permanova':
        permanova = Permanova(md_map, dm, categories[0])
        permanova_results = permanova(num_perms)

        with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
            out_f.write(format_permanova_results(permanova_results))
    else:
        # NOTE(review): `methods` is not defined in this function; it is
        # presumably a module-level list of valid method names -- confirm.
        raise ValueError("Unrecognized method '%s'. Valid methods: %r"
                         % (method, methods))