Example #1
 def permanova_permdisp(self):
     # run PERMDISP and PERMANOVA on the distance matrix
     print('running permdisp\n\n')
     print(permdisp(distance_matrix=DistanceMatrix(self.dist_df),
                    grouping=[_.split('_')[0] for _ in list(self.dist_df)], permutations=999))
     print('running permanova\n\n')
     print(permanova(distance_matrix=DistanceMatrix(self.dist_df),
                     grouping=[_.split('_')[0] for _ in list(self.dist_df)], permutations=9999))
Example #2
def get_clusters(x_original, axis=['row', 'column'][0]):
    """Performs UPGMA clustering using euclidean distances"""
    x = x_original.copy()
    if axis == 'column':
        x = x.T
    nr = x.shape[0]
    metric_f = get_nonphylogenetic_metric('euclidean')
    row_dissims = DistanceMatrix(metric_f(x), map(str, range(nr)))
    # do upgma - rows
    # Average in SciPy's cluster.hierarchy.linkage is UPGMA
    linkage_matrix = linkage(row_dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, row_dissims.ids)
    row_order = [int(tip.name) for tip in tree.tips()]
    return row_order
Example #3
def _compute_collapsed_dm(dm, i, j, disallow_negative_branch_length,
                          new_node_id):
    """Return the distance matrix resulting from joining ids i and j in a node.

    If the input distance matrix has shape ``(n, n)``, the result will have
    shape ``(n-1, n-1)``, as the ids `i` and `j` are collapsed into a single
    new id.

    """
    in_n = dm.shape[0]
    out_n = in_n - 1
    out_ids = [new_node_id]
    out_ids.extend([e for e in dm.ids if e not in (i, j)])
    result = np.zeros((out_n, out_n))
    # pre-populate the result array with known distances
    ij_indexes = [dm.index(i), dm.index(j)]
    result[1:, 1:] = np.delete(np.delete(dm.data, ij_indexes, axis=0),
                               ij_indexes,
                               axis=1)
    # calculate the new distances from the current DistanceMatrix
    k_to_u = 0.5 * (dm[i] + dm[j] - dm[i, j])
    # set negative branches to 0 if specified
    if disallow_negative_branch_length:
        k_to_u[k_to_u < 0] = 0
    # drop nodes being joined
    k_to_u = np.delete(k_to_u, ij_indexes)
    # assign the distances to the result array
    result[0] = result[:, 0] = np.concatenate([[0], k_to_u])
    return DistanceMatrix(result, out_ids)
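A minimal usage sketch of the function above (assuming numpy and scikit-bio are importable and the function is in scope; the 4x4 matrix and the new node id 'u' are made up for illustration). Joining ids 'a' and 'b' collapses the 4x4 matrix to 3x3:

from skbio import DistanceMatrix

dm = DistanceMatrix([[0, 5, 9, 9],
                     [5, 0, 10, 10],
                     [9, 10, 0, 8],
                     [9, 10, 8, 0]],
                    ids=['a', 'b', 'c', 'd'])

collapsed = _compute_collapsed_dm(dm, 'a', 'b',
                                  disallow_negative_branch_length=True,
                                  new_node_id='u')
print(collapsed.shape)       # (3, 3): 'a' and 'b' are replaced by 'u'
print(collapsed['u', 'c'])   # 7.0 = 0.5 * (d(a, c) + d(b, c) - d(a, b))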
Example #4
    def setUp(self):
        self.test_dm = DistanceMatrix(
            np.array([
                [0, 1, 2, 3, 4],
                [1, 0, 4, 5, 6],
                [2, 4, 0, 6, 7],
                [3, 5, 6, 0, 8],
                [4, 6, 7, 8, 0],
            ]),
            ids=[f'S{i}' for i in range(5)],
        )

        n_samples = 100
        np.random.seed(825)
        sample_embedding = np.random.normal(size=(n_samples, 3)) + 2
        sample_embedding[:, 1] *= 3
        sample_embedding[:, 2] *= 6
        sample_df = pd.DataFrame(
            sample_embedding,
            index=[f'S{i}' for i in range(n_samples)],
            columns=[f'C{i}' for i in range(3)],
        )

        self.test_ord_results = OrdinationResults(
            'foo',
            'bar',
            eigvals=pd.Series(np.arange(n_samples)),
            samples=sample_df,
        )
Example #5
    def distances(self, distance_fn):
        """Compute distances between all pairs of sequences

        Parameters
        ----------
        distance_fn : function
            Function for computing the distance between a pair of sequences.
            This must take two sequences as input (as `skbio.Sequence` objects)
            and return a single integer or float value.

        Returns
        -------
        skbio.DistanceMatrix
            Matrix containing the distances between all pairs of sequences.

        """
        sequence_count = self.sequence_count()
        dm = np.zeros((sequence_count, sequence_count))
        ids = []
        for i in range(sequence_count):
            self_i = self[i]
            ids.append(self_i.metadata['id'])
            for j in range(i):
                dm[i, j] = dm[j, i] = self_i.distance(self[j], distance_fn)
        return DistanceMatrix(dm, ids)
Example #6
def _reduce(blocks):
    """Reduce an iterable of partial distance matrices into a full matrix

    Note that the reduction does not care which pairs were computed, so if a
    distance between a given pair appears in multiple blocks, the values will
    be summed. As such, this reduction is only safe when used by the
    block_beta_diversity method, which ensures that distances are not
    computed multiple times.
    """
    all_blocks = list(blocks)

    # Determine the maximum integer ID observed in the blocks. There exists a
    # 1-1 mapping between the integer ID and a sample ID. We increment by 1
    # as the integer ID space begins with zero, and we'll be using this value
    # to determine the size of the resulting full distance matrix.
    n_ids = max(map(lambda x: max(x.ids), all_blocks)) + 1

    mat = np.zeros((n_ids, n_ids), dtype=float)

    # TODO: something smarter.
    for block in all_blocks:
        n_blk_ids = len(block.ids)

        # get the corresponding coordinates in the master matrix
        master_idx = [(i, j) for row, i in enumerate(block.ids)
                      for j in block.ids[row + 1:]]

        # get the corresponding coordinates within the current block
        block_idx = [(i, j) for row, i in enumerate(range(n_blk_ids))
                     for j in range(row + 1, n_blk_ids)]

        for (m_i, m_j), (b_i, b_j) in zip(master_idx, block_idx):
            mat[m_i, m_j] += block.data[b_i, b_j]

    return DistanceMatrix(mat + mat.T, list(range(n_ids)))
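A minimal sketch of the reduction (assuming numpy and scikit-bio are available and _reduce is in scope). The lightweight Block namedtuple below is a hypothetical stand-in for the partial results produced by the block beta diversity machinery; each block only needs .ids (integer sample IDs) and .data (a small square matrix), and together the blocks cover every pair exactly once:

from collections import namedtuple
import numpy as np

Block = namedtuple('Block', ['ids', 'data'])

# three 2x2 partial matrices over integer sample IDs 0, 1 and 2
blocks = [
    Block(ids=(0, 1), data=np.array([[0.0, 0.2], [0.2, 0.0]])),
    Block(ids=(0, 2), data=np.array([[0.0, 0.7], [0.7, 0.0]])),
    Block(ids=(1, 2), data=np.array([[0.0, 0.5], [0.5, 0.0]])),
]

full = _reduce(blocks)
print(full.shape)    # (3, 3)
print(full[0, 2])    # 0.7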
Example #7
def pcoa(lines):
    """Run PCoA on the distance matrix present on lines"""
    # Parse the distance matrix
    dist_mtx = DistanceMatrix.read(lines)
    # Create the PCoA object
    pcoa_obj = PCoA(dist_mtx)
    # Get the PCoA results and return them
    return pcoa_obj.scores()
Example #8
def pcoa(lines):
    """Run PCoA on the distance matrix present on lines"""
    # Parse the distance matrix
    dist_mtx = DistanceMatrix.read(lines)
    # Create the PCoA object
    pcoa_obj = PCoA(dist_mtx)
    # Get the PCoA results and return them
    return pcoa_obj.scores()
Example #9
def main():
    if not os.path.exists('./fasta_db'):
        os.mkdir('./fasta_db')

    if not os.path.exists('./RES'):
        os.mkdir('./RES')

    # skempi_v1 = obtain_seq('SKP1402m.ddg.txt', 'SKP1402m.seq.txt')
    skempi_v1 = obtain_seq('SKP1102s.ddg.txt', 'SKP1102s.seq.txt')

    write_to_fasta(skempi_v1, './fasta_db/skempi_v1_SKP1102s.fasta')
    chain_name, dist_mat = generate_dist_mat(fasta_Seq='./fasta_db/skempi_v1_SKP1102s.fasta',
                                             dist_fun=dist_fun, dist_max=1, dist_unify_fun=min)

    # plot the distance matrix
    plt.imshow(dist_mat)
    plt.colorbar()
    plt.show()

    # To change the linkage, change linkage="complete" to linkage="single" or linkage="average".
    # the difference in the linkage can be found at
    # https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html, at comments of parameter linkage

    # Now cluster those with identity > 25% together, following the reviewer's comment.
    # You can also choose to specify n_clusters, but then distance_threshold will need to be None.
    # Just uncomment the following two lines.
    # Agg_cluster = AgglomerativeClustering(n_clusters=65, affinity="precomputed",
    #                                       linkage="complete", compute_full_tree=True, distance_threshold=None)
    Agg_cluster = AgglomerativeClustering(n_clusters=None, affinity="precomputed",
                                          linkage="complete", compute_full_tree=True, distance_threshold=0.75)
    Agg_cluster.fit(dist_mat)

    # plot dendrogram
    cnd_dist_mat = DistanceMatrix(dist_mat).condensed_form()
    L = linkage(cnd_dist_mat, method='complete')
    plt.title('Dendrogram of sequences')
    dendrogram(L)
    plt.show()

    # save the result
    result = pd.DataFrame({"Chain": chain_name, "label": Agg_cluster.labels_})
    result.to_csv('./RES/SKP1102s_cluster_label.csv', index=False)
    check_dict = {k: str(v) for k, v in zip(chain_name, Agg_cluster.labels_)}

    # read in the skempi_v1 dataset and add the additional column
    # dataset = pd.read_csv('dataFile/SKP1402m.ddg.txt', sep='\t', header=None)
    dataset = pd.read_csv('dataFile/SKP1102s.ddg.txt', sep='\t', header=None)

    identifier_col = []
    for _, row in dataset.iterrows():
        identifier_col.append(label_obs(check_dict, row))
    dataset['class'] = identifier_col
    print(dataset.head())

    dataset.to_csv('./RES/SKP1102s.ddg_class.txt',
                   sep='\t',
                   index=False,
                   header=False)
Example #10
def single_file_nj(input_file, output_file):
    dm = DistanceMatrix.read(input_file)

    tree = nj(dm)

    # write output
    f = open(output_file, 'w')
    f.write(tree.to_newick(with_distances=True))
    f.close()
Example #11
def single_file_nj(input_file, output_file):
    dm = DistanceMatrix.read(input_file)

    tree = nj(dm)

    # write output
    f = open(output_file, 'w')
    f.write(tree.to_newick(with_distances=True))
    f.close()
Example #12
 def testPer(self, dist, group):
     per = self.permanova(dist, group)
     print(per[0])
     print(per[2])
     print(
         permanova(DistanceMatrix(dist, range(len(group))),
                   group,
                   column=None,
                   permutations=999))
Example #13
 def table_to_distances(table, pairwise_distance_fn):
     sample_ids = table.columns
     num_samples = len(sample_ids)
     data = zeros((num_samples, num_samples))
     for i, sample1_id in enumerate(sample_ids):
         for j, sample2_id in enumerate(sample_ids[:i]):
             data[i, j] = data[j, i] = pairwise_distance_fn(
                 table, sample1_id, sample2_id)
     return DistanceMatrix(data, sample_ids)
Example #14
def _order_dms(x, y, strict=True, lookup=None):
    """Intersect distance matrices and put them in the same order."""
    x_is_dm = isinstance(x, DistanceMatrix)
    y_is_dm = isinstance(y, DistanceMatrix)

    if (x_is_dm and not y_is_dm) or (y_is_dm and not x_is_dm):
        raise TypeError(
            "Mixing DistanceMatrix and array_like input types is not "
            "supported. Both x and y must either be DistanceMatrix instances "
            "or array_like, but not mixed.")
    elif x_is_dm and y_is_dm:
        if lookup is not None:
            x = _remap_ids(x, lookup, 'x', 'first')
            y = _remap_ids(y, lookup, 'y', 'second')

        if tuple(x.ids) == tuple(y.ids):
            return x, y

        id_order = [id_ for id_ in x.ids if id_ in y]
        num_matches = len(id_order)

        if (strict and ((num_matches != len(x.ids)) or
                        (num_matches != len(y.ids)))):
            raise ValueError("IDs exist that are not in both distance "
                             "matrices.")

        if num_matches < 1:
            raise ValueError("No matching IDs exist between the distance "
                             "matrices.")

        return x.filter(id_order), y.filter(id_order)
    else:
        # Both x and y aren't DistanceMatrix instances.
        if lookup is not None:
            raise ValueError("ID lookup can only be provided if inputs are "
                             "DistanceMatrix instances.")

        x = DistanceMatrix(x)
        y = DistanceMatrix(y)

        if x.shape != y.shape:
            raise ValueError("Distance matrices must have the same shape.")

        return x, y
Example #15
def compute_aligned_sequence_distances(seqs, distance_fn=hamming_distance):
    dm = []
    ids = []
    for id1, seq1 in seqs:
        ids.append(id1)
        row = []
        for id2, seq2 in seqs:
            row.append(distance_fn(seq1, seq2))
        dm.append(row)
    return DistanceMatrix(dm, ids)
Example #16
def guide_tree_from_query_sequences(query_sequences,
                                    distance_fn=three_mer_distance,
                                    display_tree=False):
    guide_dm = []
    seq_ids = []
    for seq_id1, seq1 in query_sequences:
        seq_ids.append(seq_id1)
        row = []
        for seq_id2, seq2 in query_sequences:
            row.append(distance_fn(seq1, seq2))
        guide_dm.append(row)

    guide_dm = DistanceMatrix(guide_dm, seq_ids)
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
                             link_color_func=lambda x: 'black')
    return guide_tree
Example #17
def _compute_q(dm):
    """Compute Q matrix, used to identify the next pair of nodes to join.

    """
    q = np.zeros(dm.shape)
    n = dm.shape[0]
    for i in range(n):
        for j in range(i):
            q[i, j] = q[j, i] = \
                ((n - 2) * dm[i, j]) - dm[i].sum() - dm[j].sum()
    return DistanceMatrix(q, dm.ids)
Example #18
def _compute_q(dm):
    """Compute Q matrix, used to identify the next pair of nodes to join.

    """
    q = np.zeros(dm.shape)
    n = dm.shape[0]
    big_sum = np.array([dm.data.sum(1)] * dm.shape[0])
    big_sum_diffs = big_sum + big_sum.T
    q = (n - 2) * dm.data - big_sum_diffs
    np.fill_diagonal(q, 0)
    return DistanceMatrix(q, dm.ids)
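Examples #17 and #18 are a loop-based and a vectorized formulation of the same Q matrix used in neighbor joining. A standalone numpy-only sanity check (it does not call the private helpers above; the 4x4 matrix is made up for illustration) showing that the two formulations agree:

import numpy as np

d = np.array([[0.0, 5.0, 9.0, 9.0],
              [5.0, 0.0, 10.0, 10.0],
              [9.0, 10.0, 0.0, 8.0],
              [9.0, 10.0, 8.0, 0.0]])
n = d.shape[0]

# element-wise definition: q[i, j] = (n - 2) * d[i, j] - sum(d[i]) - sum(d[j])
q_loop = np.zeros_like(d)
for i in range(n):
    for j in range(i):
        q_loop[i, j] = q_loop[j, i] = (n - 2) * d[i, j] - d[i].sum() - d[j].sum()

# vectorized form, equivalent to the big_sum construction in Example #18
row_sums = d.sum(axis=1)
q_vec = (n - 2) * d - row_sums[:, None] - row_sums[None, :]
np.fill_diagonal(q_vec, 0)

assert np.allclose(q_loop, q_vec)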
Example #19
def do_pcoa(infile):
    samples, distmtx = parse_distmat(infile)
    # coords, each row is an axis
    distmtx = DistanceMatrix(distmtx, ids=samples)
    ord_res = pcoa(distmtx)
    coords = ord_res.samples
    eigvals = ord_res.eigvals
    pcnts = ord_res.proportion_explained

    #Write results to output
    ord_res.write(sys.stdout)
Example #20
def guide_tree_from_query_sequences(query_sequences,
                                    distance_fn=three_mer_distance,
                                    display_tree=False):
    guide_dm = []
    seq_ids = []
    for seq_id1, seq1 in query_sequences:
        seq_ids.append(seq_id1)
        row = []
        for seq_id2, seq2 in query_sequences:
            row.append(distance_fn(seq1, seq2))
        guide_dm.append(row)

    guide_dm = DistanceMatrix(guide_dm, seq_ids)
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm,
                             labels=guide_dm.ids,
                             orientation='right',
                             link_color_func=lambda x: 'black')
    return guide_tree
Example #21
    def setUp(self):
        self.counts = pd.read_csv(get_data_path('analyses/raw_otu_table.csv'),
                                  sep='\t',
                                  dtype={'#SampleID': str})
        self.counts.set_index('#SampleID', inplace=True)

        self.metrics_beta = ["unweighted_unifrac", "bray_curtis"]

        self.beta = dict()
        for metric in self.metrics_beta:
            self.beta[metric] = DistanceMatrix.read(
                get_data_path('analyses/beta_%s.dm.txt' % metric))
Example #22
def get_dmat(embedding, leaf_names, metric='euclidean', logger=None):
    """
    Compute distances from embedding and return scikit-bio DistanceMatrix

    Args:
        embedding:          the embedding for each taxon
        leaf_names:         the leaf (taxon) names
    """
    if logger:
        logger.info("computing %s distances" % metric)
    dist = squareform(pdist(embedding, metric=metric))
    dmat = DistanceMatrix(dist, leaf_names)
    return dmat
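A quick usage sketch (assuming numpy, scipy and scikit-bio are installed and get_dmat is in scope together with its pdist/squareform/DistanceMatrix imports; the random embedding and taxon names are made up for illustration):

import numpy as np

rng = np.random.default_rng(42)
embedding = rng.normal(size=(4, 16))          # one 16-dimensional vector per taxon
leaf_names = ['taxonA', 'taxonB', 'taxonC', 'taxonD']

dmat = get_dmat(embedding, leaf_names, metric='cosine')
print(dmat.shape)                  # (4, 4)
print(dmat['taxonA', 'taxonB'])    # cosine distance between the two embeddings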
Example #23
    def __call__(self, distance_matrix, output, verbose, *args, **kwargs):
        logger.info("Loading distance matrix...")
        dm = DistanceMatrix.read(distance_matrix)

        logger.info("Building tree...")
        tree = skbio.tree.nj(dm)
        tree = tree.root_at_midpoint()

        if verbose > 0:
            logger.info("Approximate tree using neighbour joining:\n%s",
                        tree.ascii_art())

        tree.write(output, format='newick')
        logger.info("Done.")
Example #24
def pw_distances(metric, counts, ids=None, **kwargs):
    """Compute distances between all pairs of columns in a counts matrix

    Parameters
    ----------
    metric : str, callable
        The pairwise distance function as a string or callable to use when
        generating pairwise distances. See the scipy ``pdist`` docs and the
        scikit-bio functions linked under *See Also* for available metrics.
    counts : 2D array_like of ints or floats
        Matrix containing count/abundance data where each row contains counts
        of observations in a given sample.
    ids : iterable of strs, optional
        Identifiers for each sample in ``counts``.

    Returns
    -------
    skbio.DistanceMatrix
        Distances between all pairs of samples (i.e., rows). The number of
        rows and columns will be equal to the number of rows in ``counts``.

    Raises
    ------
    ValueError
        If ``len(ids) != len(counts)``.

    See Also
    --------
    unweighted_unifrac
    weighted_unifrac
    scipy.spatial.distance.pdist
    pw_distances_from_table

    """
    _skbio_metrics = _get_skbio_metrics()
    num_samples = len(counts)
    if ids is not None and num_samples != len(ids):
        raise ValueError(
            "Number of rows in counts must be equal to number of provided "
            "ids.")
    if metric in _skbio_metrics:
        metric = _skbio_metrics[metric]

    if callable(metric):
        metric = partial(metric, **kwargs)

    distances = pdist(counts, metric)
    return DistanceMatrix(
        squareform(distances, force='tomatrix', checks=False), ids)
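A small sketch of how the metric argument is resolved (assuming numpy/scipy are installed and pw_distances is in scope along with the private _get_skbio_metrics helper it references; 'cityblock' is assumed not to be a scikit-bio metric name, so it goes straight to scipy's pdist). A string metric is passed to pdist as-is, while a callable is wrapped with functools.partial so extra keyword arguments are bound:

import numpy as np

counts = np.array([[1, 0, 2],
                   [3, 1, 0],
                   [0, 2, 2]])
ids = ['A', 'B', 'C']

dm1 = pw_distances('cityblock', counts, ids)
dm2 = pw_distances(lambda u, v: np.abs(u - v).sum(), counts, ids)

print(np.allclose(dm1.data, dm2.data))   # True: both compute the L1 distance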
Example #25
def js_TSNE(distributions):
    """Dimension reduction via Jensen-Shannon Divergence & t-distributed Stochastic Neighbor Embedding

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities.

    Returns
    -------
    t-SNE : array, shape (`n_dists`, 2)
    """
    dist_matrix = DistanceMatrix(dist.squareform(dist.pdist(distributions.values, _jensen_shannon)))
    model = TSNE(n_components=2, random_state=0, metric='precomputed')
    return model.fit_transform(dist_matrix.data)
Example #26
def js_PCoA(distributions):
    """Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities.

    Returns
    -------
    pcoa : array, shape (`n_dists`, 2)
    """
    dist_matrix = DistanceMatrix(dist.squareform(dist.pdist(distributions.values, _jensen_shannon)))
    pcoa = PCoA(dist_matrix).scores()
    return pcoa.site[:, 0:2]
Example #27
    def distances(self, distance_fn):
        """Compute distances between all pairs of sequences

        Parameters
        ----------
        distance_fn : function
            Function for computing the distance between a pair of sequences.
            This must take two sequences as input (as `skbio.Sequence` objects)
            and return a single integer or float value.

        Returns
        -------
        skbio.DistanceMatrix
            Matrix containing the distances between all pairs of sequences.

        See Also
        --------
        skbio.DistanceMatrix
        scipy.spatial.distance.hamming

        Examples
        --------
        >>> from scipy.spatial.distance import hamming
        >>> from skbio import SequenceCollection
        >>> from skbio import DNA
        >>> seqs = [DNA("ACCGGGTT", metadata={'id': "s1"}),
        ...         DNA("ACTTGGTT", metadata={'id': "s2"}),
        ...         DNA("ACTAGGTT", metadata={'id': "s3"})]
        >>> a1 = SequenceCollection(seqs)
        >>> print(a1.distances(hamming))
        3x3 distance matrix
        IDs:
        's1', 's2', 's3'
        Data:
        [[ 0.     0.25   0.25 ]
         [ 0.25   0.     0.125]
         [ 0.25   0.125  0.   ]]

        """
        sequence_count = self.sequence_count()
        dm = np.zeros((sequence_count, sequence_count))
        ids = []
        for i in range(sequence_count):
            self_i = self[i]
            ids.append(self_i.metadata['id'])
            for j in range(i):
                dm[i, j] = dm[j, i] = self_i.distance(self[j], distance_fn)
        return DistanceMatrix(dm, ids)
Example #28
def table_to_distances(table, pairwise_distance_fn):
    """
    Function to make a distance matrix
    """
    from skbio.stats.distance import DistanceMatrix
    from numpy import zeros
    sample_ids = table.columns
    num_samples = len(sample_ids)
    data = zeros((num_samples, num_samples))
    for i, sample1_id in enumerate(sample_ids):
        for j, sample2_id in enumerate(sample_ids[:i]):
            data[i, j] = data[j, i] = pairwise_distance_fn(
                table, sample1_id, sample2_id)

    return DistanceMatrix(data, sample_ids)
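A minimal usage sketch (assuming pandas, numpy and scikit-bio are installed and table_to_distances is in scope; the toy table and the l1_distance helper below are made up for illustration). Columns of the table are treated as samples:

import numpy as np
import pandas as pd

# rows are observations, columns are samples
table = pd.DataFrame([[1, 0, 3],
                      [4, 2, 0],
                      [0, 5, 2]],
                     columns=['S1', 'S2', 'S3'])

def l1_distance(table, id1, id2):
    return float(np.abs(table[id1] - table[id2]).sum())

dm = table_to_distances(table, l1_distance)
print(dm['S1', 'S3'])   # 8.0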
Example #29
def emb_tree(embedder,
             dist,
             leaf_names,
             target_tree,
             taxa_metadata,
             metric='euclidean',
             metric_kwargs=dict(),
             **fit_kwargs):
    emb = embedder.fit_transform(dist, **fit_kwargs)
    _dist = pdist(emb, metric=metric, **metric_kwargs)
    tree = upgma_tree(DistanceMatrix(squareform(_dist), leaf_names))
    ret = dict()
    ret['rfd'] = target_tree.compare_rfd(tree)
    ret['subsets'] = target_tree.compare_subsets(tree)
    ret['tip_distances'] = target_tree.compare_tip_distances(tree)
    ret['pearson'] = pearsonr(_dist, squareform(dist))[0]
    ret['spearman'] = spearmanr(_dist, squareform(dist))[0]
    ret.update(get_phylo_stats(tree, taxa_metadata))
    return ret
Example #30
def pw_distances_from_table(table, metric="braycurtis"):
    """Compute distances between all pairs of samples in table

    Parameters
    ----------
    table : biom.table.Table
        ``Table`` containing count/abundance data of observations across
        samples.
    metric : str, optional
        The name of the pairwise distance function to use when generating
        pairwise distances. See the scipy ``pdist`` docs, linked under *See
        Also*, for available metrics.

    Returns
    -------
    skbio.DistanceMatrix
        Distances between all pairs of samples. The number of rows and columns
        will be equal to the number of samples in ``table``.

    See Also
    --------
    scipy.spatial.distance.pdist
    biom.table.Table
    pw_distances

    """
    warn(
        "pw_distances_from_table is deprecated. In the future (tentatively "
        "scikit-bio 0.2.0), pw_distance will take a biom.table.Table object "
        "and this function will be removed. You will need to update your "
        "code to call pw_distances at that time.", DeprecationWarning)
    sample_ids = table.ids(axis="sample")
    num_samples = len(sample_ids)

    # initialize the result object
    dm = np.zeros((num_samples, num_samples))
    for i, sid1 in enumerate(sample_ids):
        v1 = table.data(sid1)
        for j, sid2 in enumerate(sample_ids[:i]):
            v2 = table.data(sid2)
            dm[i, j] = dm[j, i] = pdist([v1, v2], metric)
    return DistanceMatrix(dm, sample_ids)
Example #31
def pw_distances(counts, ids=None, metric="braycurtis"):
    """Compute distances between all pairs of columns in a counts matrix

    Parameters
    ----------
    counts : 2D array_like of ints or floats
        Matrix containing count/abundance data where each row contains counts
        of observations in a given sample.
    ids : iterable of strs, optional
        Identifiers for each sample in ``counts``.
    metric : str, optional
        The name of the pairwise distance function to use when generating
        pairwise distances. See the scipy ``pdist`` docs, linked under *See
        Also*, for available metrics.

    Returns
    -------
    skbio.DistanceMatrix
        Distances between all pairs of samples (i.e., rows). The number of
        rows and columns will be equal to the number of rows in ``counts``.

    Raises
    ------
    ValueError
        If ``len(ids) != len(counts)``.

    See Also
    --------
    scipy.spatial.distance.pdist
    pw_distances_from_table

    """
    num_samples = len(counts)
    if ids is not None and num_samples != len(ids):
        raise ValueError(
            "Number of rows in counts must be equal to number of provided "
            "ids.")

    distances = pdist(counts, metric)
    return DistanceMatrix(
        squareform(distances, force='tomatrix', checks=False), ids)
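A quick usage sketch (assuming numpy, scipy and scikit-bio are installed and pw_distances is in scope; the counts and sample ids are made up for illustration). The default metric is Bray-Curtis, and a mismatched id/row count raises a ValueError:

import numpy as np

counts = np.array([[10, 0, 3],
                   [2, 8, 5]])

dm = pw_distances(counts, ids=['sampleA', 'sampleB'])
print(dm['sampleA', 'sampleB'])   # Bray-Curtis distance between the two samples

try:
    pw_distances(counts, ids=['onlyOne'])
except ValueError as e:
    print(e)   # number of ids must match number of rows in counts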
Example #32
def single_file_upgma(input_file, output_file):
    # read in dist matrix
    dist_mat = DistanceMatrix.read(input_file)

    # SciPy uses average as UPGMA:
    # http://docs.scipy.org/doc/scipy/reference/generated/
    #    scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    linkage_matrix = linkage(dist_mat.condensed_form(), method='average')

    tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)

    # write output
    f = open(output_file, 'w')
    try:
        f.write(tree.to_newick(with_distances=True))
    except AttributeError:
        if tree is None:
            raise RuntimeError("""input file %s did not make a UPGMA tree.
 Ensure it has more than one sample present""" % (str(input_file), ))
        raise
    f.close()
Example #33
def single_file_upgma(input_file, output_file):
    # read in dist matrix
    dist_mat = DistanceMatrix.read(input_file)

    # SciPy uses average as UPGMA:
    # http://docs.scipy.org/doc/scipy/reference/generated/
    #    scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    linkage_matrix = linkage(dist_mat.condensed_form(), method='average')

    tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)

    # write output
    f = open(output_file, 'w')
    try:
        f.write(tree.to_newick(with_distances=True))
    except AttributeError:
        if tree is None:
            raise RuntimeError("""input file %s did not make a UPGMA tree.
 Ensure it has more than one sample present""" % (str(input_file),))
        raise
    f.close()
Example #34
def _compute_collapsed_dm(dm, i, j, disallow_negative_branch_length,
                          new_node_id):
    """Return the distance matrix resulting from joining ids i and j in a node.

    If the input distance matrix has shape ``(n, n)``, the result will have
    shape ``(n-1, n-1)``, as the ids `i` and `j` are collapsed into a single
    new id.

    """
    in_n = dm.shape[0]
    out_n = in_n - 1
    out_ids = [new_node_id]
    out_ids.extend([e for e in dm.ids if e not in (i, j)])
    result = np.zeros((out_n, out_n))
    for idx1, out_id1 in enumerate(out_ids[1:]):
        result[0, idx1 + 1] = result[idx1 + 1, 0] = _otu_to_new_node(
            dm, i, j, out_id1, disallow_negative_branch_length)
        for idx2, out_id2 in enumerate(out_ids[1:idx1 + 1]):
            result[idx1+1, idx2+1] = result[idx2+1, idx1+1] = \
                dm[out_id1, out_id2]
    return DistanceMatrix(result, out_ids)
Example #35
 def setUp(self):
     self.dm100 = DistanceMatrix.read(get_data_path('distMatrix_100.txt'))
     self.dm20 = DistanceMatrix.read(get_data_path('distMatrix_20_f5.txt'))
Example #36
samples = otu_table.index
graph_dm = pd.DataFrame(graph_dm,
                        index=samples,
                        columns=samples)
graph_dm.to_csv('../results/aitchison.txt', sep='\t')

# Read in graph_dm
graph_dm = pd.read_csv('../results/unconnected_aitchison.txt',
                       sep='\t', index_col=0)
# table = pd.read_table('../data/skinmap_chemiFrac_test.txt',
#                        sep='\t', index_col=0)
graph_dm.index = table.columns
graph_dm.columns = table.columns
# _dm = pw_distances('braycurtis', table.values, table.index.values)
# _dm.write('../results/braycurtis.txt')
_dm = DistanceMatrix(graph_dm.values + graph_dm.values.T)
_dm.ids = graph_dm.index
pcoa_v = pcoa(_dm)

fig = plt.figure(3)
plt.plot(pcoa_v.samples['PC1'],
         pcoa_v.samples['PC2'], 'ob')
# plt.plot(pcoa_v.eigvecs[not_stressed, 0],
#          pcoa_v.eigvecs[not_stressed, 1],
#          'o', c='#FFFFFF', label='Before stress')
# plt.plot(pcoa_v.eigvecs[stressed, 0],
#          pcoa_v.eigvecs[stressed, 1],
#          'o', c='#999999', label='After stress')
# plt.legend(loc=3)
#plt.title('Weighted Aitchison on Coral data')
#fig.savefig('../results/coral_chemifrac.png')
Example #37
def mantel(x, y, method='pearson', permutations=999, alternative='two-sided'):
    """Compute correlation between distance matrices using the Mantel test.

    The Mantel test compares two distance matrices by computing the correlation
    between the distances in the lower (or upper) triangular portions of the
    symmetric distance matrices. Correlation can be computed using Pearson's
    product-moment correlation coefficient or Spearman's rank correlation
    coefficient.

    As defined in [1]_, the Mantel test computes a test statistic :math:`r_M`
    given two symmetric distance matrices :math:`D_X` and :math:`D_Y`.
    :math:`r_M` is defined as

    .. math::

       r_M=\\frac{1}{d-1}\\sum_{i=1}^{n-1}\\sum_{j=i+1}^{n}
       stand(D_X)_{ij}stand(D_Y)_{ij}

    where

    .. math::

       d=\\frac{n(n-1)}{2}

    and :math:`n` is the number of rows/columns in each of the distance
    matrices. :math:`stand(D_X)` and :math:`stand(D_Y)` are distance matrices
    with their upper triangles containing standardized distances. Note that
    since :math:`D_X` and :math:`D_Y` are symmetric, the lower triangular
    portions of the matrices could equivalently have been used instead of the
    upper triangular portions (the current function behaves in this manner).

    If ``method='spearman'``, the above equation operates on ranked distances
    instead of the original distances.

    Statistical significance is assessed via a permutation test. The rows and
    columns of the first distance matrix (`x`) are randomly permuted a
    number of times (controlled via `permutations`). A correlation coefficient
    is computed for each permutation and the p-value is the proportion of
    permuted correlation coefficients that are equal to or more extreme
    than the original (unpermuted) correlation coefficient. Whether a permuted
    correlation coefficient is "more extreme" than the original correlation
    coefficient depends on the alternative hypothesis (controlled via
    `alternative`).

    Parameters
    ----------
    x, y : array_like or DistanceMatrix
        Input distance matrices to compare. Both matrices must have the same
        shape and be at least 3x3 in size. If ``array_like``, will be cast to
        ``DistanceMatrix`` (thus the requirements of a valid ``DistanceMatrix``
        apply to both `x` and `y`, such as symmetry and hollowness). If inputs
        are already ``DistanceMatrix`` instances, the IDs do not need to match
        between them; they are assumed to both be in the same order regardless
        of their IDs (the underlying data matrix is the only thing considered
        by this function).
    method : {'pearson', 'spearman'}
        Method used to compute the correlation between distance matrices.
    permutations : int, optional
        Number of times to randomly permute `x` when assessing statistical
        significance. Must be greater than or equal to zero. If zero,
        statistical significance calculations will be skipped and the p-value
        will be ``np.nan``.
    alternative : {'two-sided', 'greater', 'less'}
        Alternative hypothesis to use when calculating statistical
        significance. The default ``'two-sided'`` alternative hypothesis
        calculates the proportion of permuted correlation coefficients whose
        magnitude (i.e. after taking the absolute value) is greater than or
        equal to the absolute value of the original correlation coefficient.
        ``'greater'`` calculates the proportion of permuted coefficients that
        are greater than or equal to the original coefficient. ``'less'``
        calculates the proportion of permuted coefficients that are less than
        or equal to the original coefficient.

    Returns
    -------
    tuple of floats
        Correlation coefficient and p-value of the test.

    Raises
    ------
    ValueError
        If `x` and `y` are not the same shape and at least 3x3 in size, or an
        invalid `method`, number of `permutations`, or `alternative` are
        provided.

    See Also
    --------
    DistanceMatrix
    scipy.stats.pearsonr
    scipy.stats.spearmanr

    Notes
    -----
    The Mantel test was first described in [2]_. The general algorithm and
    interface are similar to ``vegan::mantel``, available in R's vegan
    package [3]_.

    ``np.nan`` will be returned for the p-value if `permutations` is zero or if
    the correlation coefficient is ``np.nan``. The correlation coefficient will
    be ``np.nan`` if one or both of the inputs does not have any variation
    (i.e. the distances are all constant) and ``method='spearman'``.

    References
    ----------
    .. [1] Legendre, P. and Legendre, L. (2012) Numerical Ecology. 3rd English
       Edition. Elsevier.

    .. [2] Mantel, N. (1967). "The detection of disease clustering and a
       generalized regression approach". Cancer Research 27 (2): 209-220. PMID
       6018555.

    .. [3] http://cran.r-project.org/web/packages/vegan/index.html

    Examples
    --------
    Define two 3x3 distance matrices:

    >>> x = [[0, 1, 2],
    ...      [1, 0, 3],
    ...      [2, 3, 0]]
    >>> y = [[0, 2, 7],
    ...      [2, 0, 6],
    ...      [7, 6, 0]]

    Compute the Pearson correlation between them and assess significance using
    a two-sided test with 999 permutations:

    >>> coeff, p_value = mantel(x, y)
    >>> round(coeff, 4)
    0.7559

    Thus, we see a moderate-to-strong positive correlation (:math:`r_M=0.7559`)
    between the two matrices.

    """
    if method == 'pearson':
        corr_func = pearsonr
    elif method == 'spearman':
        corr_func = spearmanr
    else:
        raise ValueError("Invalid correlation method '%s'." % method)

    if permutations < 0:
        raise ValueError("Number of permutations must be greater than or "
                         "equal to zero.")
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError("Invalid alternative hypothesis '%s'." % alternative)

    x = DistanceMatrix(x)
    y = DistanceMatrix(y)

    if x.shape != y.shape:
        raise ValueError("Distance matrices must have the same shape.")
    if x.shape[0] < 3:
        raise ValueError("Distance matrices must be at least 3x3 in size.")

    x_flat = x.condensed_form()
    y_flat = y.condensed_form()

    orig_stat = corr_func(x_flat, y_flat)[0]

    if permutations == 0 or np.isnan(orig_stat):
        p_value = np.nan
    else:
        perm_gen = (corr_func(x.permute(condensed=True), y_flat)[0]
                    for _ in range(permutations))
        permuted_stats = np.fromiter(perm_gen, float, count=permutations)

        if alternative == 'two-sided':
            count_better = (np.absolute(permuted_stats) >=
                            np.absolute(orig_stat)).sum()
        elif alternative == 'greater':
            count_better = (permuted_stats >= orig_stat).sum()
        else:
            count_better = (permuted_stats <= orig_stat).sum()

        p_value = (count_better + 1) / (permutations + 1)

    return orig_stat, p_value
Example #38
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run (e.g. 'anosim',
            'permanova', 'bioenv', 'adonis', 'morans_i', 'mrpp', 'permdisp',
            or 'dbrda')
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'bioenv', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'bioenv' or 'morans_i', this parameter will
            be ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """
    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BIO-ENV "
                         "analyses). Please choose a different metadata "
                         "column to perform statistical tests on.")

    dm = DistanceMatrix.read(dm_fp)

    if method in ('anosim', 'permanova', 'bioenv'):
        with open(map_fp, 'U') as map_f:
            md_dict = parse_mapping_file_to_dict(map_f)[0]
        df = pd.DataFrame.from_dict(md_dict, orient='index')

        out_fp = join(out_dir, '%s_results.txt' % method)

        if method in ('anosim', 'permanova'):
            if method == 'anosim':
                method_cls = ANOSIM
            elif method == 'permanova':
                method_cls = PERMANOVA

            method_inst = method_cls(dm, df, column=categories[0])
            results = method_inst(num_perms)

            with open(out_fp, 'w') as out_f:
                out_f.write(results.summary())
        elif method == 'bioenv':
            results = bioenv(dm, df, columns=categories)
            results.to_csv(out_fp, sep='\t')
    else:
        # Remove any samples from the mapping file that aren't in the distance
        # matrix (important for validation checks). Use strict=True so that an
        # error is raised if the distance matrix contains any samples that
        # aren't in the mapping file.
        with open(map_fp, 'U') as map_f:
            md_map = MetadataMap.parseMetadataMap(map_f)
        md_map.filterSamples(dm.ids, strict=True)

        # These methods are run in R. Input validation must be done here before
        # running the R commands.
        if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
            # Check to make sure all categories passed in are in mapping file
            # and are not all the same value.
            for category in categories:
                if not category in md_map.CategoryNames:
                    raise ValueError("Category '%s' not found in mapping file "
                                     "columns." % category)

                if md_map.hasSingleCategoryValue(category):
                    raise ValueError("All values in category '%s' are the "
                                     "same. The statistical method '%s' "
                                     "cannot operate on a category that "
                                     "creates only a single group of samples "
                                     "(e.g. there are no 'between' distances "
                                     "because there is only a single group)."
                                     % (category, method))

            # Build the command arguments string.
            command_args = ['-d %s -m %s -c %s -o %s'
                            % (dm_fp, map_fp, categories[0], out_dir)]

            if method == 'morans_i':
                # Moran's I requires only numeric categories.
                for category in categories:
                    if not md_map.isNumericCategory(category):
                        raise TypeError("The category '%s' is not numeric. "
                                        "Not all values could be converted to "
                                        "numbers." % category)
            else:
                # The rest require groups of samples, so the category values
                # cannot all be unique.
                for category in categories:
                    if md_map.hasUniqueCategoryValues(category):
                        raise ValueError("All values in category '%s' are "
                                         "unique. This statistical method "
                                         "cannot operate on a category with "
                                         "unique values (e.g. there are no "
                                         "'within' distances because each "
                                         "group of samples contains only a "
                                         "single sample)." % category)

                # Only Moran's I doesn't accept a number of permutations.
                if num_perms < 0:
                    raise ValueError("The number of permutations must be "
                                     "greater than or equal to zero.")

                command_args[0] += ' -n %d' % num_perms

            rex = RExecutor(TmpDir=get_qiime_temp_dir())
            rex(command_args, '%s.r' % method, output_dir=out_dir)
        else:
            raise ValueError("Unrecognized method '%s'. Valid methods: %r"
                             % (method, methods))
Example #39
def pwmantel(dms, labels=None, method='pearson', permutations=999,
             alternative='two-sided', strict=True, lookup=None):
    """Run Mantel tests for every pair of given distance matrices.

    Runs a Mantel test for each pair of distance matrices and collates the
    results in a ``DataFrame``. Distance matrices do not need to be in the same
    ID order if they are ``DistanceMatrix`` instances. Distance matrices will
    be re-ordered prior to running each pairwise test, and if ``strict=False``,
    IDs that don't match between a pair of distance matrices will be dropped
    prior to running the test (otherwise a ``ValueError`` will be raised if
    there are nonmatching IDs between any pair of distance matrices).

    Parameters
    ----------
    dms : iterable of DistanceMatrix objects, array_like objects, or filepaths
        to distance matrices. If they are ``array_like``, no reordering or
        matching of IDs will be performed.
    labels : iterable of str or int, optional
        Labels for each distance matrix in `dms`. These are used in the results
        ``DataFrame`` to identify the pair of distance matrices used in a
        pairwise Mantel test. If ``None``, defaults to monotonically-increasing
        integers starting at zero.
    method : {'pearson', 'spearman'}
        Correlation method. See ``mantel`` function for more details.
    permutations : int, optional
        Number of permutations. See ``mantel`` function for more details.
    alternative : {'two-sided', 'greater', 'less'}
        Alternative hypothesis. See ``mantel`` function for more details.
    strict : bool, optional
        Handling of nonmatching IDs. See ``mantel`` function for more details.
    lookup : dict, optional
        Map existing IDs to new IDs. See ``mantel`` function for more details.

    Returns
    -------
    pandas.DataFrame
        ``DataFrame`` containing the results of each pairwise test (one per
        row). Includes the number of objects considered in each test as column
        ``n`` (after applying `lookup` and filtering nonmatching IDs if
        ``strict=False``). Column ``p-value`` will display p-values as ``NaN``
        if p-values could not be computed (they are stored as ``np.nan`` within
        the ``DataFrame``; see ``mantel`` for more details).

    See Also
    --------
    mantel
    DistanceMatrix.read

    Notes
    -----
    Passing a list of filepaths can be useful as it allows for a smaller amount
    of memory consumption as it only loads two matrices at a time as opposed to
    loading all distance matrices into memory.

    Examples
    --------
    Import the functionality we'll use in the following examples:

    >>> from skbio import DistanceMatrix
    >>> from skbio.stats.distance import pwmantel

    Define three 3x3 distance matrices:

    >>> x = DistanceMatrix([[0, 1, 2],
    ...                     [1, 0, 3],
    ...                     [2, 3, 0]])
    >>> y = DistanceMatrix([[0, 2, 7],
    ...                     [2, 0, 6],
    ...                     [7, 6, 0]])
    >>> z = DistanceMatrix([[0, 5, 6],
    ...                     [5, 0, 1],
    ...                     [6, 1, 0]])

    Run Mantel tests for each pair of distance matrices (there are 3 possible
    pairs):

    >>> pwmantel((x, y, z), labels=('x', 'y', 'z'),
    ...          permutations=0) # doctest: +NORMALIZE_WHITESPACE
                 statistic p-value  n   method  permutations alternative
    dm1 dm2
    x   y     0.755929     NaN  3  pearson             0   two-sided
        z    -0.755929     NaN  3  pearson             0   two-sided
    y   z    -0.142857     NaN  3  pearson             0   two-sided

    Note that we passed ``permutations=0`` to suppress significance tests; the
    p-values in the output are labelled ``NaN``.

    """
    num_dms = len(dms)

    if num_dms < 2:
        raise ValueError("Must provide at least two distance matrices.")

    if labels is None:
        labels = range(num_dms)
    else:
        if num_dms != len(labels):
            raise ValueError("Number of labels must match the number of "
                             "distance matrices.")
        if len(set(labels)) != len(labels):
            raise ValueError("Labels must be unique.")

    num_combs = scipy.special.comb(num_dms, 2, exact=True)
    results_dtype = [('dm1', object), ('dm2', object), ('statistic', float),
                     ('p-value', float), ('n', int), ('method', object),
                     ('permutations', int), ('alternative', object)]
    results = np.empty(num_combs, dtype=results_dtype)

    for i, pair in enumerate(combinations(zip(labels, dms), 2)):
        (xlabel, x), (ylabel, y) = pair
        if isinstance(x, str):
            x = DistanceMatrix.read(x)
        if isinstance(y, str):
            y = DistanceMatrix.read(y)

        stat, p_val, n = mantel(x, y, method=method, permutations=permutations,
                                alternative=alternative, strict=strict,
                                lookup=lookup)

        results[i] = (xlabel, ylabel, stat, p_val, n, method, permutations,
                      alternative)

    return pd.DataFrame.from_records(results, index=('dm1', 'dm2'))