Example No. 1
 def test_from_iterable_validate_equal_valid_data(self):
     validate_true = DistanceMatrix.from_iterable((x for x in range(4)),
                                                  lambda a, b: abs(b - a),
                                                  validate=True)
     validate_false = DistanceMatrix.from_iterable((x for x in range(4)),
                                                   lambda a, b: abs(b - a),
                                                   validate=False)
     self.assertEqual(validate_true, validate_false)
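Both calls above should produce the same matrix; a minimal standalone sketch of the same from_iterable call (the default string ids are an assumption based on skbio's usual behaviour):

from skbio import DistanceMatrix

# 4x4 matrix of absolute differences over the values 0..3; without
# explicit keys the ids default to '0'..'3'.
dm = DistanceMatrix.from_iterable(range(4), lambda a, b: abs(b - a),
                                  validate=True)
print(dm.shape)       # (4, 4)
print(dm['0', '3'])   # 3.0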
Example No. 2
 def test_from_iterable_validate_false_non_symmetric(self):
     exp = DistanceMatrix([[0, 1, 2, 3],
                           [1, 0, 1, 2],
                           [2, 1, 0, 1],
                           [3, 2, 1, 0]])
     res = DistanceMatrix.from_iterable((x for x in range(4)),
                                        lambda a, b: a - b,
                                        validate=False)
     self.assertEqual(res, exp)
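The expected matrix is symmetric even though the metric a - b is not: with validate=False, from_iterable appears to evaluate the metric for only one ordering of each pair and mirror the value, which is what this test relies on. A small sketch that checks the resulting data directly:

import numpy as np
from skbio import DistanceMatrix

dm = DistanceMatrix.from_iterable((x for x in range(4)),
                                  lambda a, b: a - b,
                                  validate=False)
# Stored data is symmetric and hollow despite the asymmetric metric.
assert np.allclose(dm.data, dm.data.T)
assert np.allclose(np.diag(dm.data), 0)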
Example No. 3
    def test_from_iterable_no_key(self):
        iterable = (x for x in range(4))

        exp = DistanceMatrix([[0, 1, 2, 3],
                              [1, 0, 1, 2],
                              [2, 1, 0, 1],
                              [3, 2, 1, 0]])
        res = DistanceMatrix.from_iterable(iterable, lambda a, b: abs(b - a))
        self.assertEqual(res, exp)
Example No. 4
    def test_init_invalid_input(self):
        # Asymmetric.
        data = [[0.0, 2.0], [1.0, 0.0]]
        with self.assertRaises(DistanceMatrixError):
            DistanceMatrix(data, ['a', 'b'])

        # Ensure that the superclass validation is still being performed.
        with self.assertRaises(DissimilarityMatrixError):
            DistanceMatrix([[1, 2, 3]], ['a'])
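The same two validation paths outside a test case, as a rough sketch (both exception classes are assumed to be importable from skbio.stats.distance):

from skbio import DistanceMatrix
from skbio.stats.distance import (DissimilarityMatrixError,
                                  DistanceMatrixError)

try:
    DistanceMatrix([[0.0, 2.0], [1.0, 0.0]], ['a', 'b'])  # asymmetric
except DistanceMatrixError as e:
    print('asymmetric data rejected:', e)

try:
    DistanceMatrix([[1, 2, 3]], ['a'])  # not square
except DissimilarityMatrixError as e:
    print('non-square data rejected:', e)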
Example No. 5
    def setUp(self):
        super(DistanceMatrixTests, self).setUp()

        self.dm_1x1 = DistanceMatrix(self.dm_1x1_data, ['a'])
        self.dm_2x2 = DistanceMatrix(self.dm_2x2_data, ['a', 'b'])
        self.dm_3x3 = DistanceMatrix(self.dm_3x3_data, ['a', 'b', 'c'])

        self.dms = [self.dm_1x1, self.dm_2x2, self.dm_3x3]
        self.dm_condensed_forms = [np.array([]), np.array([0.123]),
                                   np.array([0.01, 4.2, 12.0])]
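The condensed forms listed above follow SciPy's convention (upper-triangle values read row by row), so they round-trip through scipy.spatial.distance.squareform; a brief sketch with values matching the 3x3 case:

import numpy as np
from scipy.spatial.distance import squareform
from skbio import DistanceMatrix

dm = DistanceMatrix([[0.0, 0.01, 4.2],
                     [0.01, 0.0, 12.0],
                     [4.2, 12.0, 0.0]], ['a', 'b', 'c'])
condensed = dm.condensed_form()   # -> [0.01, 4.2, 12.0]
assert np.allclose(squareform(condensed), dm.data)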
Example No. 6
    def test_from_file_with_file_path(self):
        """Should identify the filepath correctly and parse from it."""

        # should fail with the expected exception
        with self.assertRaises(DissimilarityMatrixFormatError):
            DistanceMatrix.from_file(self.bad_dm_fp)

        obs = DistanceMatrix.from_file(self.dm_3x3_fp)
        self.assertEqual(self.dm_3x3, obs)
        self.assertTrue(isinstance(obs, DistanceMatrix))
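In more recent skbio releases the read classmethod covers the same use case as from_file; a minimal sketch of the delimited layout both expect (an id header row, then one labelled row per id):

from io import StringIO
from skbio import DistanceMatrix

fh = StringIO("\ta\tb\n"
              "a\t0.0\t0.5\n"
              "b\t0.5\t0.0\n")
dm = DistanceMatrix.read(fh)
print(dm.ids)   # ('a', 'b')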
Example No. 7
    def test_from_iterable_with_keys(self):
        iterable = (x for x in range(4))

        exp = DistanceMatrix(
            [[0, 1, 2, 3], [1, 0, 1, 2], [2, 1, 0, 1], [3, 2, 1, 0]],
            ['0', '1', '4', '9'])
        res = DistanceMatrix.from_iterable(iterable,
                                           lambda a, b: abs(b - a),
                                           keys=iter(['0', '1', '4', '9']))
        self.assertEqual(res, exp)
Example No. 8
    def test_from_file_with_file_path(self):
        """Should identify the filepath correctly and parse from it."""

        # should fail with the expected exception
        with self.assertRaises(DissimilarityMatrixFormatError):
            DistanceMatrix.from_file(self.bad_dm_fp)

        obs = DistanceMatrix.from_file(self.dm_3x3_fp)
        self.assertEqual(self.dm_3x3, obs)
        self.assertTrue(isinstance(obs, DistanceMatrix))
Example No. 9
 def test_to_series_4x4(self):
     dm = DistanceMatrix([
         [0, 0.25, 0.75, 0.75],
         [0.25, 0.0, 0.5, 0.5],
         [0.75, 0.5, 0.0, 0.0],
         [0.75, 0.5, 0.0, 0.0]], ['a', 'b', 'c', 'd'])
     series = dm.to_series()
     exp = pd.Series([0.25, 0.75, 0.75, 0.25, 0.5, 0.5, 0.75, 0.5, 0.75, 0.5],
                     index=[('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'a'),
                            ('b', 'c'), ('b', 'd'), ('c', 'a'), ('c', 'b'),
                            ('d', 'a'), ('d', 'b')])
     assert_series_almost_equal(series, exp)
Example No. 10
    def test_to_series_4x4(self):
        dm = DistanceMatrix([[0.0, 0.2, 0.3, 0.4], [0.2, 0.0, 0.5, 0.6],
                             [0.3, 0.5, 0.0, 0.7], [0.4, 0.6, 0.7, 0.0]],
                            ['a', 'b', 'c', 'd'])

        series = dm.to_series()

        exp = pd.Series([0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
                        index=pd.Index([('a', 'b'), ('a', 'c'), ('a', 'd'),
                                        ('b', 'c'), ('b', 'd'), ('c', 'd')]))
        assert_series_almost_equal(series, exp)
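As the expected Series above shows, to_series keeps one entry per unordered pair (the upper triangle), indexed by id tuples; a standalone sketch:

from skbio import DistanceMatrix

dm = DistanceMatrix([[0.0, 0.2, 0.3],
                     [0.2, 0.0, 0.5],
                     [0.3, 0.5, 0.0]], ['a', 'b', 'c'])
s = dm.to_series()
print(len(s))   # 3 pairs for a 3x3 matrix
print(s)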
Example No. 11
    def setUp(self):
        data1 = [[0,  5,  9,  9,  8],
                 [5,  0, 10, 10,  9],
                 [9, 10,  0,  8,  7],
                 [9, 10,  8,  0,  3],
                 [8,  9,  7,  3,  0]]
        ids1 = list('abcde')
        self.dm1 = DistanceMatrix(data1, ids1)
        # this newick string was confirmed against http://www.trex.uqam.ca/
        # which generated the following (isomorphic) newick string:
        # (d:2.0000,e:1.0000,(c:4.0000,(a:2.0000,b:3.0000):3.0000):2.0000);
        self.expected1_str = ("(d:2.000000, (c:4.000000, (b:3.000000,"
                              " a:2.000000):3.000000):2.000000, e:1.000000);")
        self.expected1_TreeNode = TreeNode.read(StringIO(self.expected1_str))

        # this example was pulled from the Phylip manual
        # http://evolution.genetics.washington.edu/phylip/doc/neighbor.html
        data2 = [[0.0000, 1.6866, 1.7198, 1.6606, 1.5243, 1.6043, 1.5905],
                 [1.6866, 0.0000, 1.5232, 1.4841, 1.4465, 1.4389, 1.4629],
                 [1.7198, 1.5232, 0.0000, 0.7115, 0.5958, 0.6179, 0.5583],
                 [1.6606, 1.4841, 0.7115, 0.0000, 0.4631, 0.5061, 0.4710],
                 [1.5243, 1.4465, 0.5958, 0.4631, 0.0000, 0.3484, 0.3083],
                 [1.6043, 1.4389, 0.6179, 0.5061, 0.3484, 0.0000, 0.2692],
                 [1.5905, 1.4629, 0.5583, 0.4710, 0.3083, 0.2692, 0.0000]]
        ids2 = ["Bovine", "Mouse", "Gibbon", "Orang", "Gorilla", "Chimp",
                "Human"]
        self.dm2 = DistanceMatrix(data2, ids2)
        self.expected2_str = ("(Mouse:0.76891, (Gibbon:0.35793, (Orang:0.28469"
                              ", (Gorilla:0.15393, (Chimp:0.15167, Human:0.117"
                              "53):0.03982):0.02696):0.04648):0.42027, Bovine:"
                              "0.91769);")
        self.expected2_TreeNode = TreeNode.read(StringIO(self.expected2_str))

        data3 = [[0, 5, 4, 7, 6, 8],
                 [5, 0, 7, 10, 9, 11],
                 [4, 7, 0, 7, 6, 8],
                 [7, 10, 7, 0, 5, 8],
                 [6, 9, 6, 5, 0, 8],
                 [8, 11, 8, 8, 8, 0]]
        ids3 = list(map(str, range(6)))
        self.dm3 = DistanceMatrix(data3, ids3)
        self.expected3_str = ("((((0:1.000000,1:4.000000):1.000000,2:2.000000"
                              "):1.250000,5:4.750000):0.750000,3:2.750000,4:2."
                              "250000);")
        self.expected3_TreeNode = TreeNode.read(StringIO(self.expected3_str))

        # this dm can yield negative branch lengths
        data4 = [[0,  5,  9,  9,  800],
                 [5,  0, 10, 10,  9],
                 [9, 10,  0,  8,  7],
                 [9, 10,  8,  0,  3],
                 [800,  9,  7,  3,  0]]
        ids4 = list('abcde')
        self.dm4 = DistanceMatrix(data4, ids4)
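These fixtures feed scikit-bio's neighbour joining. A minimal sketch of how nj is typically called on such a matrix (nj is assumed to be imported from skbio.tree):

from skbio import DistanceMatrix
from skbio.tree import nj

dm = DistanceMatrix([[0, 5, 9, 9, 8],
                     [5, 0, 10, 10, 9],
                     [9, 10, 0, 8, 7],
                     [9, 10, 8, 0, 3],
                     [8, 9, 7, 3, 0]], list('abcde'))
tree = nj(dm)                               # skbio TreeNode
newick = nj(dm, result_constructor=str)     # newick string instead
print(newick)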
Example No. 12
def progressive_msa_and_tree(sequences,
                             pairwise_aligner,
                             metric=kmer_distance,
                             guide_tree=None,
                             display_aln=False,
                             display_tree=False):
    """ Perform progressive msa of sequences and build a UPGMA tree
    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be aligned.
    pairwise_aligner : function
        Function that should be used to perform the pairwise alignments,
        for example skbio.alignment.global_pairwise_align_nucleotide. Must
        support skbio.Sequence objects or skbio.TabularMSA objects
        as input.
    metric : function, optional
      Function that returns a single distance value when given a pair of
      skbio.Sequence objects. This will be used to build a guide tree if one
      is not provided.
    guide_tree : skbio.TreeNode, optional
        The tree that should be used to guide the alignment process.
    display_aln : bool, optional
        Print the alignment before returning.
    display_tree : bool, optional
        Print the tree before returning.

    Returns
    -------
    skbio.alignment
    skbio.TreeNode

    """
    if guide_tree is None:
        guide_dm = DistanceMatrix.from_iterable(
                        sequences, metric=metric, key='id')
        guide_lm = average(guide_dm.condensed_form())
        guide_tree = TreeNode.from_linkage_matrix(guide_lm, guide_dm.ids)

    msa = progressive_msa(sequences, guide_tree,
                          pairwise_aligner=pairwise_aligner)

    if display_aln:
        print(msa)

    msa_dm = DistanceMatrix.from_iterable(msa, metric=metric, key='id')
    msa_lm = average(msa_dm.condensed_form())
    msa_tree = TreeNode.from_linkage_matrix(msa_lm, msa_dm.ids)
    if display_tree:
        print("\nOutput tree:")
        d = dendrogram(msa_lm, labels=msa_dm.ids, orientation='right',
                       link_color_func=lambda x: 'black', leaf_font_size=24)
    return msa, msa_tree
Example No. 13
    def test_fsvd(self):
        dm1 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        dm2 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        dm3 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))

        # Test eigh vs. fsvd pcoa and inplace parameter
        expected_results = pcoa(dm1, method="eigh", number_of_dimensions=3,
                                inplace=False)

        results = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                       inplace=False)

        results_inplace = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                               inplace=True)

        assert_ordination_results_equal(results, expected_results,
                                        ignore_directionality=True,
                                        ignore_method_names=True)

        assert_ordination_results_equal(results, results_inplace,
                                        ignore_directionality=True,
                                        ignore_method_names=True)

        # Test number_of_dimensions edge cases
        results2 = pcoa(dm3, method="fsvd", number_of_dimensions=0,
                        inplace=False)
        expected_results2 = pcoa(dm3, method="fsvd",
                                 number_of_dimensions=dm3.data.shape[0],
                                 inplace=False)

        assert_ordination_results_equal(results2, expected_results2,
                                        ignore_directionality=True,
                                        ignore_method_names=True)

        with self.assertRaises(ValueError):
            dim_too_large = dm1.data.shape[0] + 10
            pcoa(dm2, method="fsvd", number_of_dimensions=dim_too_large)

        with self.assertRaises(ValueError):
            pcoa(dm2, method="fsvd", number_of_dimensions=-1)

        with self.assertRaises(ValueError):
            dim_too_large = dm1.data.shape[0] + 10
            pcoa(dm2, method="eigh", number_of_dimensions=dim_too_large)

        with self.assertRaises(ValueError):
            pcoa(dm2, method="eigh", number_of_dimensions=-1)

        dm_big = DistanceMatrix.read(get_data_path('PCoA_sample_data_12dim'))
        with self.assertWarnsRegex(RuntimeWarning,
                                   "no value for number_of_dimensions"):
            pcoa(dm_big, method="fsvd", number_of_dimensions=0)
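For orientation, a minimal sketch of the pcoa entry point exercised above (method='eigh' is the exact decomposition; 'fsvd' trades accuracy for speed on large matrices, which is what the test compares):

from skbio import DistanceMatrix
from skbio.stats.ordination import pcoa

dm = DistanceMatrix([[0.0, 0.5, 1.0],
                     [0.5, 0.0, 0.75],
                     [1.0, 0.75, 0.0]], ['a', 'b', 'c'])
res = pcoa(dm, method='eigh')   # OrdinationResults
print(res.proportion_explained)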
Example No. 14
    def test_fsvd_inplace(self):
        dm1 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        dm2 = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))

        expected_results = pcoa(dm1, method="eigh", number_of_dimensions=3,
                                inplace=True)

        results = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                       inplace=True)

        assert_ordination_results_equal(results, expected_results,
                                        ignore_directionality=True,
                                        ignore_method_names=True)
Example No. 15
def distmat_corr(truthfile, distfile, reps=3, corrstat=spearman):
    '''Returns correlation between condensed distance matrices, using corrstat'''
    distmat = DistanceMatrix.read(distfile)
    truthmat = DistanceMatrix.read(truthfile)
    truthmat = sample_matrix_to_runs(truthmat, reps)

    ids = list(sorted(distmat.ids))
    t_ids = list(sorted(truthmat.ids))
    assert ids == t_ids, (ids, t_ids)

    dist = distmat.filter(ids).condensed_form()
    truth = truthmat.filter(ids).condensed_form()
    return corrstat(truth, dist)
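The helper above hinges on DistanceMatrix.filter returning rows and columns in the requested id order, so the two condensed vectors line up. A condensed sketch of that core step using scipy's spearmanr directly (sample_matrix_to_runs is project-specific and not assumed here):

from scipy import stats
from skbio import DistanceMatrix

def condensed_correlation(dm_a, dm_b):
    """Spearman correlation between two distance matrices over shared ids."""
    ids = sorted(set(dm_a.ids) & set(dm_b.ids))
    a = dm_a.filter(ids).condensed_form()
    b = dm_b.filter(ids).condensed_form()
    return stats.spearmanr(a, b).correlation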
Example No. 16
    def test_to_series_4x4(self):
        dm = DistanceMatrix([
            [0.0, 0.2, 0.3, 0.4],
            [0.2, 0.0, 0.5, 0.6],
            [0.3, 0.5, 0.0, 0.7],
            [0.4, 0.6, 0.7, 0.0]], ['a', 'b', 'c', 'd'])

        series = dm.to_series()

        exp = pd.Series([0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
                        index=pd.Index([('a', 'b'), ('a', 'c'), ('a', 'd'),
                                        ('b', 'c'), ('b', 'd'), ('c', 'd')]))
        assert_series_almost_equal(series, exp)
Example No. 17
def distmat_corr(truthfile, distfile, reps=3, corrstat=spearman):
    '''Returns correlation between condensed distance matrices, using corrstat'''
    distmat = DistanceMatrix.read(distfile)
    truthmat = DistanceMatrix.read(truthfile)
    truthmat = sample_matrix_to_runs(truthmat, reps)

    ids = list(sorted(distmat.ids))
    t_ids = list(sorted(truthmat.ids))
    assert ids == t_ids, (ids, t_ids)

    dist = distmat.filter(ids).condensed_form()
    truth = truthmat.filter(ids).condensed_form()
    return corrstat(truth, dist)
Example No. 18
    def test_empty(self):
        # array of empty vectors
        actual = beta_diversity('euclidean',
                                np.array([[], []], dtype=np.int64),
                                ids=['a', 'b'])
        expected_dm = DistanceMatrix([[0.0, 0.0], [0.0, 0.0]], ['a', 'b'])
        npt.assert_array_equal(actual, expected_dm)

        actual = beta_diversity('unweighted_unifrac',
                                np.array([[], []], dtype=np.int64),
                                ids=['a', 'b'], tree=self.tree1, otu_ids=[])
        expected_dm = DistanceMatrix([[0.0, 0.0], [0.0, 0.0]], ['a', 'b'])
        self.assertEqual(actual, expected_dm)
Example No. 19
    def setUp(self):
        self.minx = DistanceMatrix([[0, 1, 2], [1, 0, 3], [2, 3, 0]])
        self.miny = DistanceMatrix([[0, 2, 7], [2, 0, 6], [7, 6, 0]])
        self.minz = DistanceMatrix([[0, 0.5, 0.25],
                                    [0.5, 0, 0.1],
                                    [0.25, 0.1, 0]])
        self.min_dms = (self.minx, self.miny, self.minz)

        # Versions of self.minx and self.minz (above) that each have an extra
        # ID on the end.
        self.x_extra = DistanceMatrix([[0, 1, 2, 7],
                                       [1, 0, 3, 2],
                                       [2, 3, 0, 4],
                                       [7, 2, 4, 0]], ['0', '1', '2', 'foo'])
        self.z_extra = DistanceMatrix([[0, 0.5, 0.25, 3],
                                       [0.5, 0, 0.1, 24],
                                       [0.25, 0.1, 0, 5],
                                       [3, 24, 5, 0]], ['0', '1', '2', 'bar'])

        # Load expected results. We have to load the p-value column (column
        # index 3) as a string dtype in order to compare with the in-memory
        # results since we're formatting the p-values as strings with the
        # correct number of decimal places. Without this explicit converter,
        # the p-value column will be loaded as a float dtype and the frames
        # won't compare equal.
        p_val_conv = {3: str}

        self.exp_results_minimal = pd.read_csv(
            get_data_path('pwmantel_exp_results_minimal.txt'), sep='\t',
            index_col=(0, 1), converters=p_val_conv)

        self.exp_results_minimal_with_labels = pd.read_csv(
            get_data_path('pwmantel_exp_results_minimal_with_labels.txt'),
            sep='\t', index_col=(0, 1), converters=p_val_conv)

        self.exp_results_duplicate_dms = pd.read_csv(
            get_data_path('pwmantel_exp_results_duplicate_dms.txt'),
            sep='\t', index_col=(0, 1), converters=p_val_conv)

        self.exp_results_na_p_value = pd.read_csv(
            get_data_path('pwmantel_exp_results_na_p_value.txt'),
            sep='\t', index_col=(0, 1), converters=p_val_conv)

        self.exp_results_too_few_permutations = pd.read_csv(
            get_data_path('pwmantel_exp_results_too_few_permutations.txt'),
            sep='\t', index_col=(0, 1), converters=p_val_conv)

        self.exp_results_reordered_distance_matrices = pd.read_csv(
            get_data_path('pwmantel_exp_results_reordered_distance_matrices'
                          '.txt'),
            sep='\t', index_col=(0, 1), converters=p_val_conv)
Example No. 20
    def test_compute_collapsed_dm(self):
        expected_data = [[0, 7, 7, 6], [7, 0, 8, 7], [7, 8, 0, 3],
                         [6, 7, 3, 0]]
        expected_ids = ['x', 'c', 'd', 'e']
        expected1 = DistanceMatrix(expected_data, expected_ids)
        self.assertEqual(_compute_collapsed_dm(self.dm1, 'a', 'b', True, 'x'),
                         expected1)

        # computed manually
        expected_data = [[0, 4, 3], [4, 0, 3], [3, 3, 0]]
        expected_ids = ['yy', 'd', 'e']
        expected2 = DistanceMatrix(expected_data, expected_ids)
        self.assertEqual(
            _compute_collapsed_dm(expected1, 'x', 'c', True, 'yy'), expected2)
Example No. 21
    def test_compute_q(self):
        expected_data = [[0, -50, -38, -34, -34], [-50, 0, -38, -34, -34],
                         [-38, -38, 0, -40, -40], [-34, -34, -40, 0, -48],
                         [-34, -34, -40, -48, 0]]
        expected_ids = list('abcde')
        expected = DistanceMatrix(expected_data, expected_ids)
        self.assertEqual(_compute_q(self.dm1), expected)

        data = [[0, 3, 2], [3, 0, 3], [2, 3, 0]]
        dm = DistanceMatrix(data, list('abc'))
        # computed this manually
        expected_data = [[0, -8, -8], [-8, 0, -8], [-8, -8, 0]]
        expected = DistanceMatrix(expected_data, list('abc'))
        self.assertEqual(_compute_q(dm), expected)
Example No. 22
    def test_distances(self):
        s1 = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])
        expected = [[0, 0.25], [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(hamming)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.

        expected = [[0, 42.], [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)
Example No. 23
    def test_distances(self):
        expected = [[0, 6. / 13, 4. / 13], [6. / 13, 0, 7. / 13],
                    [4. / 13, 7. / 13, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances()
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.

        expected = [[0, 42., 42.], [42., 0, 42.], [42., 42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances(dumb_distance)
        self.assertEqual(actual, expected)
Example No. 24
def guide_tree_from_sequences(sequences,
                              metric=kmer_distance,
                              display_tree=False):
    """ Build a UPGMA tree by applying metric to sequences

    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
      The sequences to be represented in the resulting guide tree.
    metric : function
      Function that returns a single distance value when given a pair of
      skbio.Sequence objects.
    display_tree : bool, optional
      Print the tree before returning.

    Returns
    -------
    skbio.TreeNode

    """
    guide_dm = DistanceMatrix.from_iterable(sequences, metric=metric, key='id')
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm,
                             labels=guide_dm.ids,
                             orientation='right',
                             link_color_func=lambda x: 'black')
    return guide_tree
Example No. 25
def aln_distmat(alignment, reps=3):
    '''Calculate pairwise distances from a MSA of genomes'''
    aln = TabularMSA.read(alignment, constructor=DNA)
    aln.reassign_index(minter="id")
    dist = DistanceMatrix.from_iterable([seq.values for seq in aln],
                                        metric=hamming, keys=aln.index)
    return dist
Example No. 26
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # convert metadata columns to numeric where applicable, drop the
    # non-numeric columns, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})
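The visualizer ultimately defers to skbio.stats.distance.bioenv, which takes a DistanceMatrix and a numeric DataFrame indexed by sample id; a minimal standalone sketch with made-up values:

import pandas as pd
from skbio import DistanceMatrix
from skbio.stats.distance import bioenv

dm = DistanceMatrix([[0.0, 0.5, 0.9],
                     [0.5, 0.0, 0.4],
                     [0.9, 0.4, 0.0]], ['s1', 's2', 's3'])
md = pd.DataFrame({'ph': [6.8, 7.1, 7.9], 'depth': [1.0, 2.0, 10.0]},
                  index=['s1', 's2', 's3'])
result = bioenv(dm, md)   # DataFrame ranking subsets of the variables
print(result)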
Example No. 27
    def test_simple(self):
        eigvals = [0.51236726, 0.30071909, 0.26791207, 0.20898868,
                   0.19169895, 0.16054235,  0.15017696,  0.12245775,
                   0.0]
        proportion_explained = [0.2675738328, 0.157044696, 0.1399118638,
                                0.1091402725, 0.1001110485,
                                0.0838401162, 0.0784269939,
                                0.0639511764, 0.0]
        sample_ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
                      'PC.593', 'PC.355', 'PC.607', 'PC.634']
        axis_labels = ['PC%d' % i for i in range(1, 10)]

        expected_results = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series(eigvals, index=axis_labels),
            samples=pd.DataFrame(
                np.loadtxt(get_data_path('exp_PCoAEigenResults_site')),
                index=sample_ids, columns=axis_labels),
            proportion_explained=pd.Series(proportion_explained,
                                           index=axis_labels))

        dm = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        results = pcoa(dm)

        assert_ordination_results_equal(results, expected_results,
                                        ignore_directionality=True)
Example No. 28
def drawTree(MS_distDict, Methyl_distDict, filtered_samples, ratio, outgroup):
    '''
    Merge MS and Methyl distance matrices
    '''
    merged_distMatrix = []
    for sample1 in sorted(filtered_samples):
        sample1_dist = []
        for sample2 in sorted(filtered_samples):
            # Scale the methylation PD distance (0-100 scale) so it is
            # comparable with the MS distance (0-1 scale) before weighting.
            merged_dist = (MS_distDict[sample1][sample2] * ratio) + (
                Methyl_distDict[sample1][sample2] * (1 - ratio)) / 100
            sample1_dist.append(merged_dist)
        merged_distMatrix.append(sample1_dist)
    '''
    Run neighbor-joining phylogenetic tree building algorithm on pairwise cell distance (saved in distDict)
    '''
    distObj = DistanceMatrix(merged_distMatrix, sorted(filtered_samples))
    print(distObj.data)
    skbio_tree = nj(distObj, result_constructor=str)
    # Use skbio to build the tree from the distance matrix first, then
    # convert it to an ete Tree for outgroup handling.
    ete_tree = Tree(skbio_tree)
    if outgroup == "NA":
        return ete_tree
    else:
        if outgroup == "Midpoint":
            tree_midpoint = ete_tree.get_midpoint_outgroup()
            ete_tree.set_outgroup(tree_midpoint)
        else:
            ete_tree.set_outgroup(outgroup)
    return ete_tree
Example No. 29
    def test_confirm_betadispr_results(self):
        mp_dm = DistanceMatrix.read(get_data_path('moving_pictures_dm.tsv'))
        mp_mf = pd.read_csv(get_data_path('moving_pictures_mf.tsv'), sep='\t')
        mp_mf.set_index('#SampleID', inplace=True)

        obs_med_mp = permdisp(mp_dm, mp_mf,
                              column='BodySite')
        obs_cen_mp = permdisp(mp_dm, mp_mf, column='BodySite',
                              test='centroid')

        exp_data_m = ['PERMDISP', 'F-value', 33, 4, 10.1956, 0.001, 999]
        exp_data_c = ['PERMDISP', 'F-value', 33, 4, 17.4242, 0.001, 999]
        exp_ind = ['method name', 'test statistic name', 'sample size',
                   'number of groups', 'test statistic', 'p-value',
                   'number of permutations']

        exp_med_mp = pd.Series(data=exp_data_m, index=exp_ind, dtype='object',
                               name='PERMDISP results')

        exp_cen_mp = pd.Series(data=exp_data_c, index=exp_ind, dtype='object',
                               name='PERMDISP results')

        self.assert_series_equal(exp_med_mp, obs_med_mp)

        self.assert_series_equal(exp_cen_mp, obs_cen_mp)
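A small self-contained sketch of the permdisp call pattern used above, with toy data (permdisp is assumed to be importable from skbio.stats.distance; test may be 'median', the default, or 'centroid'):

import pandas as pd
from skbio import DistanceMatrix
from skbio.stats.distance import permdisp

dm = DistanceMatrix([[0.0, 0.5, 0.75, 1.0],
                     [0.5, 0.0, 0.8, 0.9],
                     [0.75, 0.8, 0.0, 0.2],
                     [1.0, 0.9, 0.2, 0.0]], ['s1', 's2', 's3', 's4'])
md = pd.DataFrame({'BodySite': ['gut', 'gut', 'skin', 'skin']},
                  index=['s1', 's2', 's3', 's4'])
result = permdisp(dm, md, column='BodySite', test='centroid',
                  permutations=99)
print(result['p-value'])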
Example No. 30
def effect_size(mappings, alphas, betas, output, jobs, permutations,
                overwrite, na_values):
    # As we can have multiple mapping, alpha or beta files, we will construct
    # a mfs dictionary with all the dataframes. Additionally, we will load the
    # data_dictionary.csv file so we can use it to process the data
    mappings = {f: pd.read_csv(f, sep='\t', dtype=str, na_values=na_values)
                for f in mappings}
    for m, mf in mappings.items():
        mappings[m].set_index('#SampleID', inplace=True)
    if betas:
        betas = {f: DistanceMatrix.read(f) for f in betas}

        with joblib.parallel.Parallel(n_jobs=jobs, verbose=100) as par:
            par(joblib.delayed(
                _process_column)(bf, c, fname, finfo, alphas, betas,
                                 permutations)
                for bf, c, fname, finfo in _generate_betas(
                    betas, mappings, permutations, output, overwrite))
    else:
        alphas = {f: pd.read_csv(f, sep='\t', dtype=str, na_values=na_values)
                  for f in alphas}
        for a, af in alphas.items():
            alphas[a].set_index('#SampleID', inplace=True)

        for af, c, fname, finfo in _generate_alphas(alphas, mappings,
                                                    output, overwrite):
            _process_column(af, c, fname, finfo, alphas, betas, permutations)
Example No. 31
    def setUp(self):
        # Crawford dataset for unweighted UniFrac
        fp = get_data_path('PCoA_sample_data_3')
        self.ordination = pcoa(DistanceMatrix.read(fp))

        fp = get_data_path('PCoA_biplot_descriptors')
        self.descriptors = pd.read_table(fp, index_col='Taxon').T
Example No. 32
    def test_heatmap_extra_tips(self):
        # Adds in test scenario where there more tips than features
        # in the table
        np.random.seed(0)
        num_otus = 11  # otus
        index = np.arange(5).astype(np.str)
        table = pd.DataFrame(np.random.random((len(index), num_otus)),
                             index=index,
                             columns=np.arange(num_otus).astype(np.str))

        x = np.random.rand(num_otus*2)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x-y))
        lm = ward(dm.condensed_form())
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(np.str))

        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand()*3

        md = MetadataCategory(
            pd.Series(['a', 'a', 'a', 'b', 'b'], index=index))

        dendrogram_heatmap(self.results, table, t, md)

        index_fp = os.path.join(self.results, 'index.html')
        self.assertTrue(os.path.exists(index_fp))

        with open(index_fp, 'r') as fh:
            html = fh.read()
            self.assertIn('<h1>Dendrogram heatmap</h1>',
                          html)
Example No. 33
    def setUp(self):
        # Crawford dataset for unweighted UniFrac
        fp = get_data_path('PCoA_sample_data_3')
        self.ordination = pcoa(DistanceMatrix.read(fp))

        fp = get_data_path('PCoA_biplot_descriptors')
        self.descriptors = pd.read_table(fp, index_col='Taxon').T
Example No. 34
    def fromSequences(cls, labels, sequences, findParams=None, **kwargs):
        """
        Construct an NJTree instance from some sequences.

        @param cls: Our class.
        @param labels: An iterable producing C{str} labels for the sequences.
        @param sequences: Either a C{str} filename of sequences to consider or
            a C{light.reads.Reads} instance.
        @param findParams: An instance of C{FindParameters}.
        @param kwargs: See
            C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
            additional keywords, all of which are optional.
        @return: An C{NJTree} instance.
        """
        if isinstance(sequences, str):
            sequences = FastaReads(sequences,
                                   readClass=AAReadWithX,
                                   upperCase=True)

        new = cls()
        new.sequences = list(sequences)
        new.labels = labels
        findParams = findParams or FindParameters()
        affinity = np.array(
            affinityMatrix(new.sequences, findParams=findParams, **kwargs))
        new.distance = np.ones(affinity.shape) - affinity
        new.tree = nj(DistanceMatrix(new.distance, labels))
        return new
Example No. 35
def effect_size(mappings, alphas, betas, output, jobs, permutations, overwrite,
                na_values):
    # As we can have multiple mapping, alpha or beta files, we will construct
    # a mfs dictionary with all the dataframes. Additionally, we will load the
    # data_dictionary.csv file so we can use it to process the data
    mappings = {
        f: pd.read_csv(f, sep='\t', dtype=str, na_values=na_values)
        for f in mappings
    }
    for m, mf in mappings.items():
        mappings[m].set_index('#SampleID', inplace=True)
    if betas:
        betas = {f: DistanceMatrix.read(f) for f in betas}

        with joblib.parallel.Parallel(n_jobs=jobs, verbose=100) as par:
            par(
                joblib.delayed(_process_column)(bf, c, fname, finfo, alphas,
                                                betas, permutations)
                for bf, c, fname, finfo in _generate_betas(
                    betas, mappings, permutations, output, overwrite))
    else:
        alphas = {
            f: pd.read_csv(f, sep='\t', dtype=str, na_values=na_values)
            for f in alphas
        }
        for a, af in alphas.items():
            alphas[a].set_index('#SampleID', inplace=True)

        for af, c, fname, finfo in _generate_alphas(alphas, mappings, output,
                                                    overwrite):
            _process_column(af, c, fname, finfo, alphas, betas, permutations)
Example No. 36
def guide_tree_from_sequences(sequences,
                              metric=kmer_distance,
                              display_tree=False):
    """ Build a UPGMA tree by applying metric to sequences

    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
      The sequences to be represented in the resulting guide tree.
    metric : function
      Function that returns a single distance value when given a pair of
      skbio.Sequence objects.
    display_tree : bool, optional
      Print the tree before returning.

    Returns
    -------
    skbio.TreeNode

    """
    guide_dm = DistanceMatrix.from_iterable(
                    sequences, metric=metric, key='id')
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm, labels=guide_dm.ids,
                             orientation='right',
                             link_color_func=lambda x: 'black')
    return guide_tree
Example No. 37
    def test_confirm_betadispr_results(self):
        mp_dm = DistanceMatrix.read(get_data_path('moving_pictures_dm.tsv'))
        mp_mf = pd.read_csv(get_data_path('moving_pictures_mf.tsv'), sep='\t')
        mp_mf.set_index('#SampleID', inplace=True)

        obs_med_mp = permdisp(mp_dm, mp_mf,
                              column='BodySite')
        obs_cen_mp = permdisp(mp_dm, mp_mf, column='BodySite',
                              test='centroid')

        exp_data_m = ['PERMDISP', 'F-value', 33, 4, 10.1956, 0.001, 999]
        exp_data_c = ['PERMDISP', 'F-value', 33, 4, 17.4242, 0.001, 999]
        exp_ind = ['method name', 'test statistic name', 'sample size',
                   'number of groups', 'test statistic', 'p-value',
                   'number of permutations']

        exp_med_mp = pd.Series(data=exp_data_m, index=exp_ind, dtype='object',
                               name='PERMDISP results')

        exp_cen_mp = pd.Series(data=exp_data_c, index=exp_ind, dtype='object',
                               name='PERMDISP results')

        self.assert_series_equal(exp_med_mp, obs_med_mp)

        self.assert_series_equal(exp_cen_mp, obs_cen_mp)
Example No. 38
    def __init__(self, dist_matrix):
        self.dist_matrix = dist_matrix
        nr_elements = self.dist_matrix.nr_elements
        self.matrix = []
        for i in range(nr_elements):
            row = []
            for j in range(nr_elements):
                row.append(self.dist_matrix.get_distance(i, j))
            self.matrix.append(row)
        self.ids = list(map(str, self.dist_matrix.labels))
        self.nj_dm = DistanceMatrix(self.matrix, self.ids)
        tree = nj(self.nj_dm)
        self.ids = []
        self.sources = []
        self.targets = []
        self.weights = []
        self.colors = []
        self.node_size = []
        self.virtual_nodes = 0
        self.shown_labels = {}
        self.font_colors = []

        # true #00A693 -- false #CC3333
        for node in tree.preorder():
            name_str = ''
            if node.name is None:
                self.virtual_nodes = self.virtual_nodes + 1
                name_str = 'v' + str(self.virtual_nodes)
                node.name = name_str
                self.ids.append(node.name)
                self.colors.append("black")
                self.node_size.append(20)
                self.shown_labels[str(name_str)] = ""
                self.font_colors.append('k')
            else:
                name = node.name.rsplit(' ', 1)
                if len(name) > 1:
                    node.name = name[1]
                    name2 = name[0].rsplit(' ', 1)
                    if len(name2) > 1:
                        node.name = name2[1] + name[1]
                name = node.name
                if name in []:
                    self.ids.append(node.name)
                    self.colors.append("#CC3333")
                    self.node_size.append(800)
                    name_str = node.name
                    self.shown_labels[str(name_str)] = name_str
                else:
                    self.ids.append(node.name)
                    self.colors.append("#00A693")
                    self.node_size.append(800)
                    name_str = node.name
                    self.shown_labels[str(name_str)] = name_str

        for node in tree.preorder():
            for child in node.children:
                self.sources.append(str(node.name))
                self.targets.append(str(child.name))
                self.weights.append(str(child.length))
Example No. 39
 def test_varmat1(self):
     X = pd.DataFrame({'x': np.arange(1, 10), 'y': np.arange(2, 11)})
     res = variation_matrix(X)
     exp = DistanceMatrix(
         [[0, 0.032013010420979787 / 2], [0.032013010420979787 / 2, 0]],
         ids=['x', 'y'])
     self.assertEqual(str(res), str(exp))
Example No. 40
    def test_euclidean(self):
        # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
        # near-equality testing when that support is available
        actual_dm = beta_diversity('euclidean', self.table1, self.sids1)
        self.assertEqual(actual_dm.shape, (3, 3))
        npt.assert_almost_equal(actual_dm['A', 'A'], 0.0)
        npt.assert_almost_equal(actual_dm['B', 'B'], 0.0)
        npt.assert_almost_equal(actual_dm['C', 'C'], 0.0)
        npt.assert_almost_equal(actual_dm['A', 'B'], 2.23606798)
        npt.assert_almost_equal(actual_dm['B', 'A'], 2.23606798)
        npt.assert_almost_equal(actual_dm['A', 'C'], 4.12310563)
        npt.assert_almost_equal(actual_dm['C', 'A'], 4.12310563)
        npt.assert_almost_equal(actual_dm['B', 'C'], 2.82842712)
        npt.assert_almost_equal(actual_dm['C', 'B'], 2.82842712)

        actual_dm = beta_diversity('euclidean', self.table2, self.sids2)
        expected_data = [
            [0., 80.8455317, 84.0297566, 36.3042697, 86.0116271, 78.9176786],
            [80.8455317, 0., 71.0844568, 74.4714710, 69.3397433, 14.422205],
            [84.0297566, 71.0844568, 0., 77.2851861, 8.3066238, 60.7536007],
            [36.3042697, 74.4714710, 77.2851861, 0., 78.7908624, 70.7389567],
            [86.0116271, 69.3397433, 8.3066238, 78.7908624, 0., 58.4807660],
            [78.9176786, 14.422205, 60.7536007, 70.7389567, 58.4807660, 0.]
        ]
        expected_dm = DistanceMatrix(expected_data, self.sids2)
        for id1 in self.sids2:
            for id2 in self.sids2:
                npt.assert_almost_equal(actual_dm[id1, id2],
                                        expected_dm[id1, id2], 6)
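For context, a hedged sketch of the beta_diversity entry point used above (skbio.diversity.beta_diversity with a made-up count table); it returns a DistanceMatrix keyed by the sample ids:

import numpy as np
from skbio.diversity import beta_diversity

counts = np.array([[1, 3, 0, 1, 0],
                   [0, 2, 0, 4, 4],
                   [0, 0, 6, 2, 1]])
dm = beta_diversity('euclidean', counts, ids=['A', 'B', 'C'])
print(dm['A', 'B'])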
Example No. 41
    def test_simple(self):
        eigvals = [
            0.51236726, 0.30071909, 0.26791207, 0.20898868, 0.19169895,
            0.16054235, 0.15017696, 0.12245775, 0.0
        ]
        proportion_explained = [
            0.2675738328, 0.157044696, 0.1399118638, 0.1091402725,
            0.1001110485, 0.0838401162, 0.0784269939, 0.0639511764, 0.0
        ]
        sample_ids = [
            'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
            'PC.355', 'PC.607', 'PC.634'
        ]
        axis_labels = ['PC%d' % i for i in range(1, 10)]

        expected_results = OrdinationResults(
            short_method_name='PCoA',
            long_method_name='Principal Coordinate Analysis',
            eigvals=pd.Series(eigvals, index=axis_labels),
            samples=pd.DataFrame(np.loadtxt(
                get_data_path('exp_PCoAEigenResults_site')),
                                 index=sample_ids,
                                 columns=axis_labels),
            proportion_explained=pd.Series(proportion_explained,
                                           index=axis_labels))

        dm = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        results = pcoa(dm)

        assert_ordination_results_equal(results,
                                        expected_results,
                                        ignore_directionality=True)
Example No. 42
    def test_weighted_unifrac_partial_full(self):
        # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
        # near-equality testing when that support is available
        # expected values calculated by hand
        dm1 = partial_beta_diversity('weighted_unifrac',
                                     self.table1,
                                     self.sids1,
                                     otu_ids=self.oids1,
                                     tree=self.tree1,
                                     id_pairs=[('A', 'B'), ('A', 'C'),
                                               ('B', 'C')])
        dm2 = beta_diversity('weighted_unifrac',
                             self.table1,
                             self.sids1,
                             otu_ids=self.oids1,
                             tree=self.tree1)

        self.assertEqual(dm1.shape, (3, 3))
        self.assertEqual(dm1, dm2)
        expected_data = [[0.0, 0.1750000, 0.12499999],
                         [0.1750000, 0.0, 0.3000000],
                         [0.12499999, 0.3000000, 0.0]]
        expected_dm = DistanceMatrix(expected_data, ids=self.sids1)
        for id1 in self.sids1:
            for id2 in self.sids1:
                npt.assert_almost_equal(dm1[id1, id2], expected_dm[id1, id2],
                                        6)
Example No. 43
 def test_weighted_unifrac_normalized(self):
     # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
     # near-equality testing when that support is available
     # expected values calculated by hand
     dm1 = beta_diversity('weighted_unifrac',
                          self.table1,
                          self.sids1,
                          otu_ids=self.oids1,
                          tree=self.tree1,
                          normalized=True)
     dm2 = beta_diversity(weighted_unifrac,
                          self.table1,
                          self.sids1,
                          otu_ids=self.oids1,
                          tree=self.tree1,
                          normalized=True)
     self.assertEqual(dm1.shape, (3, 3))
     self.assertEqual(dm1, dm2)
     expected_data = [[0.0, 0.128834, 0.085714], [0.128834, 0.0, 0.2142857],
                      [0.085714, 0.2142857, 0.0]]
     expected_dm = DistanceMatrix(expected_data, ids=self.sids1)
     for id1 in self.sids1:
         for id2 in self.sids1:
             npt.assert_almost_equal(dm1[id1, id2], expected_dm[id1, id2],
                                     6)
Example No. 44
    def test_braycurtis(self):
        # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
        # near-equality testing when that support is available
        actual_dm = beta_diversity('braycurtis', self.table1, self.sids1)
        self.assertEqual(actual_dm.shape, (3, 3))
        npt.assert_almost_equal(actual_dm['A', 'A'], 0.0)
        npt.assert_almost_equal(actual_dm['B', 'B'], 0.0)
        npt.assert_almost_equal(actual_dm['C', 'C'], 0.0)
        npt.assert_almost_equal(actual_dm['A', 'B'], 0.27272727)
        npt.assert_almost_equal(actual_dm['B', 'A'], 0.27272727)
        npt.assert_almost_equal(actual_dm['A', 'C'], 0.71428571)
        npt.assert_almost_equal(actual_dm['C', 'A'], 0.71428571)
        npt.assert_almost_equal(actual_dm['B', 'C'], 0.66666667)
        npt.assert_almost_equal(actual_dm['C', 'B'], 0.66666667)

        actual_dm = beta_diversity('braycurtis', self.table2, self.sids2)
        expected_data = [
            [0., 0.78787879, 0.86666667, 0.30927835, 0.85714286, 0.81521739],
            [0.78787879, 0., 0.78142077, 0.86813187, 0.75, 0.1627907],
            [0.86666667, 0.78142077, 0., 0.87709497, 0.09392265, 0.71597633],
            [0.30927835, 0.86813187, 0.87709497, 0., 0.87777778, 0.89285714],
            [0.85714286, 0.75, 0.09392265, 0.87777778, 0., 0.68235294],
            [0.81521739, 0.1627907, 0.71597633, 0.89285714, 0.68235294, 0.]
        ]
        expected_dm = DistanceMatrix(expected_data, self.sids2)
        for id1 in self.sids2:
            for id2 in self.sids2:
                npt.assert_almost_equal(actual_dm[id1, id2],
                                        expected_dm[id1, id2], 6)
Example No. 45
    def test_visualization_garbage_metadata(self):
        # tests the scenario where ndim > number of tips
        np.random.seed(0)
        num_otus = 10  # otus
        num_samples = 5
        table = pd.DataFrame(np.random.random((num_samples, num_otus)),
                             index=np.arange(num_samples).astype(np.str),
                             columns=np.arange(num_otus).astype(np.str))

        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x-y))
        lm = ward(dm.condensed_form())
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(np.str))

        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand()*3

        md = MetadataCategory(
            pd.Series(['a', 'a', 'a', 'b', 'b', 'foo', 'foo'],
                      index=np.arange(7).astype(np.str)))

        dendrogram_heatmap(self.results, table, t, md)

        index_fp = os.path.join(self.results, 'index.html')
        self.assertTrue(os.path.exists(index_fp))

        with open(index_fp, 'r') as fh:
            html = fh.read()
            self.assertIn('<h1>Dendrogram heatmap</h1>',
                          html)
Example No. 46
    def setup(self):
        with open(get_data_path('PCoA_sample_data_3'), 'U') as lines:
            dist_matrix = DistanceMatrix.from_file(lines)

        self.ordination = PCoA(dist_matrix)

        self.ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
                    'PC.355', 'PC.607', 'PC.634']
Example No. 47
def get_spearmans(distfile, truth):
    distmat = DistanceMatrix.read(distfile)
    ids = list(sorted(distmat.ids))
    distmat = distmat.filter(ids)
    dist = distmat.condensed_form()
    truth = truth.condensed_form()
    sp = stats.spearmanr(truth, dist)
    return sp.correlation
Example No. 48
 def test_from_iterable_validate_false_non_symmetric(self):
     exp = DistanceMatrix([[0, 1, 2, 3],
                           [1, 0, 1, 2],
                           [2, 1, 0, 1],
                           [3, 2, 1, 0]])
     res = DistanceMatrix.from_iterable((x for x in range(4)),
                                        lambda a, b: a - b,
                                        validate=False)
     self.assertEqual(res, exp)
Example No. 49
    def test_from_iterable_no_key(self):
        iterable = (x for x in range(4))

        exp = DistanceMatrix([[0, 1, 2, 3],
                              [1, 0, 1, 2],
                              [2, 1, 0, 1],
                              [3, 2, 1, 0]])
        res = DistanceMatrix.from_iterable(iterable, lambda a, b: abs(b - a))
        self.assertEqual(res, exp)
Example No. 50
 def test_unweighted_unifrac_qiime_tiny_test(self):
     dm_fp = get_data_path(
         os.path.join('qiime-191-tt', 'unweighted_unifrac_dm.txt'), 'data')
     expected = DistanceMatrix.read(dm_fp)
     for sid1 in self.q_table.columns:
         for sid2 in self.q_table.columns:
             actual = unweighted_unifrac(
                 self.q_table[sid1], self.q_table[sid2],
                 otu_ids=self.q_table.index, tree=self.q_tree)
             self.assertAlmostEqual(actual, expected[sid1, sid2])
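The per-pair function being checked lives in skbio.diversity.beta; a rough standalone sketch of a single call with a toy tree and made-up counts:

from io import StringIO
from skbio import TreeNode
from skbio.diversity.beta import unweighted_unifrac

tree = TreeNode.read(StringIO(
    '((O1:0.25, O2:0.50):0.25, O3:0.75)root;'))
u_counts = [1, 0, 3]   # one value per OTU id, for sample u
v_counts = [0, 2, 3]
d = unweighted_unifrac(u_counts, v_counts,
                       otu_ids=['O1', 'O2', 'O3'], tree=tree)
print(d)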
Example No. 51
 def test_io(self):
     # Very basic check that read/write public API is present and appears to
     # be functioning. Roundtrip from memory -> disk -> memory and ensure
     # results match.
     fh = StringIO()
     self.dm_3x3.write(fh)
     fh.seek(0)
     deserialized = DistanceMatrix.read(fh)
     self.assertEqual(deserialized, self.dm_3x3)
     self.assertTrue(type(deserialized) == DistanceMatrix)
Example No. 52
    def test_from_iterable_with_keys(self):
        iterable = (x for x in range(4))

        exp = DistanceMatrix([[0, 1, 2, 3],
                              [1, 0, 1, 2],
                              [2, 1, 0, 1],
                              [3, 2, 1, 0]], ['0', '1', '4', '9'])
        res = DistanceMatrix.from_iterable(iterable, lambda a, b: abs(b - a),
                                           keys=iter(['0', '1', '4', '9']))
        self.assertEqual(res, exp)
Example No. 53
    def setUp(self):
        np.random.seed(0)
        x = np.random.rand(10)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x-y))
        lm = ward(dm.condensed_form())
        ids = np.arange(len(x)).astype(np.str)
        self.tree = TreeNode.from_linkage_matrix(lm, ids)

        # initialize tree with branch length and named internal nodes
        for i, n in enumerate(self.tree.postorder(include_self=True)):
            n.length = 1
            if not n.is_tip():
                n.name = "y%d" % i
Example No. 54
    def test_varmat_larg(self):
        np.random.seed(123)
        D = 50
        N = 100
        mean = np.ones(D)*10
        cov = np.eye(D)
        X = pd.DataFrame(np.abs(np.random.multivariate_normal(mean, cov,
                                                              size=N)),
                         columns=np.arange(D).astype(np.str))
        res = variation_matrix(X)

        exp = DistanceMatrix.read(get_data_path('exp_varmat.txt'))
        self.assertEqual(str(res), str(exp))
Example No. 55
    def setUp(self):
        np.random.seed(0)
        self.table = pd.DataFrame(np.random.random((5, 5)))
        num_otus = 5  # otus
        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x-y))
        lm = ward(dm.condensed_form())
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(np.str))
        self.tree = SquareDendrogram.from_tree(t)

        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand()*3
Example No. 56
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: str = 1) -> None:
    # Validate sample metadata is superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
Example No. 57
def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           metadata: qiime2.Metadata,
                           where: str=None,
                           exclude_ids: bool=False) -> skbio.DistanceMatrix:
    ids_to_keep = metadata.ids(where=where)
    if exclude_ids:
        ids_to_keep = set(distance_matrix.ids) - set(ids_to_keep)
    # NOTE: there is no guaranteed ordering to output distance matrix because
    # `ids_to_keep` is a set, and `DistanceMatrix.filter` uses its iteration
    # order.
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError:
        raise ValueError(
            "All samples were filtered out of the distance matrix.")
Example No. 58
def rank_linkage(r, method='average'):
    r""" Hierchical Clustering on feature ranks.

    The hierarchy is built based on the rank values of the features given
    an input vector `r` of ranks. The distance between two features :math:`x`
    and :math:`y` can be defined as

    .. math::
       d(x, y) = (r(x) - r(y))^2

    Where :math:`r(x)` is the rank of the features.  Hierarchical clustering is
    then performed using :math:`d(x, y)` as the distance metric.

    This can be useful for constructing principal balances.

    Parameters
    ----------
    r : pd.Series
        Continuous vector representing some ordering of the features in X.
    method : str
        Clustering method.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import rank_linkage
    >>> ranks = pd.Series([1, 2, 4, 5],
    ...                   index=['o1', 'o2', 'o3', 'o4'])
    >>> tree = rank_linkage(ranks)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    dm = DistanceMatrix.from_iterable(r, euclidean)
    lm = linkage(dm.condensed_form(), method)
    t = TreeNode.from_linkage_matrix(lm, r.index)
    t = rename_internal_nodes(t)
    return t