예제 #1
0
 def test_input_types(self):
     # The same counts supplied as a NumPy array and as a nested list
     # must produce equal results.
     counts = [[1, 5], [2, 3]]
     sample_ids = ['a', 'b']
     from_array = beta_diversity('euclidean', np.array(counts),
                                 ids=sample_ids)
     from_list = beta_diversity('euclidean', counts, ids=sample_ids)
     self.assertEqual(from_array, from_list)
예제 #2
0
    def test_euclidean(self):
        # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
        # near-equality testing when that support is available
        small_dm = beta_diversity('euclidean', self.table1, self.sids1)
        self.assertEqual(small_dm.shape, (3, 3))

        # every sample is at distance zero from itself
        for sid in ('A', 'B', 'C'):
            npt.assert_almost_equal(small_dm[sid, sid], 0.0)

        # off-diagonal entries are checked in both orientations
        pair_distances = (
            ('A', 'B', 2.23606798),
            ('A', 'C', 4.12310563),
            ('B', 'C', 2.82842712),
        )
        for first, second, distance in pair_distances:
            npt.assert_almost_equal(small_dm[first, second], distance)
            npt.assert_almost_equal(small_dm[second, first], distance)

        large_dm = beta_diversity('euclidean', self.table2, self.sids2)
        reference_dm = DistanceMatrix([
            [0., 80.8455317, 84.0297566, 36.3042697, 86.0116271, 78.9176786],
            [80.8455317, 0., 71.0844568, 74.4714710, 69.3397433, 14.422205],
            [84.0297566, 71.0844568, 0., 77.2851861, 8.3066238, 60.7536007],
            [36.3042697, 74.4714710, 77.2851861, 0., 78.7908624, 70.7389567],
            [86.0116271, 69.3397433, 8.3066238, 78.7908624, 0., 58.4807660],
            [78.9176786, 14.422205, 60.7536007, 70.7389567, 58.4807660, 0.],
        ], self.sids2)
        for row_id in self.sids2:
            for col_id in self.sids2:
                npt.assert_almost_equal(large_dm[row_id, col_id],
                                        reference_dm[row_id, col_id], 6)
예제 #3
0
 def test_input_types(self):
     # An ndarray and a plain nested list holding identical counts are
     # expected to give equal distance results.
     raw_counts = [[1, 5], [2, 3]]
     labels = ['a', 'b']
     result_from_array = beta_diversity('euclidean', np.array(raw_counts),
                                        ids=labels)
     result_from_list = beta_diversity('euclidean', raw_counts, ids=labels)
     self.assertEqual(result_from_array, result_from_list)
예제 #4
0
 def test_qualitative_bug_issue_1549(self):
     # Jaccard computed on raw counts must equal Jaccard computed on the
     # boolean presence/absence form of the same counts.
     counts = np.array([
         [42, 0, 37, 99, 1],
         [12, 1, 22, 88, 0],
         [25, 3, 23, 86, 0],
         [0, 0, 87, 12, 0],
     ])
     presence = counts > 0
     from_counts = beta_diversity('jaccard', counts)
     from_presence = beta_diversity('jaccard', presence)
     self.assertEqual(from_counts, from_presence)
예제 #5
0
    def test_alt_pairwise_func(self):
        # confirm that pairwise_func is actually being used: every call
        # below must return exactly what the stub produces
        def not_a_real_pdist(counts, metric):
            return [[0.0, 42.0], [42.0, 0.0]]

        unifrac_kwargs = {'otu_ids': self.oids1, 'tree': self.tree1}
        cases = (
            ('unweighted_unifrac', unifrac_kwargs),
            ('weighted_unifrac', unifrac_kwargs),
            (unweighted_unifrac, unifrac_kwargs),
            ("euclidean", {}),
        )
        for metric, extra_kwargs in cases:
            observed = beta_diversity(metric, self.table1,
                                      pairwise_func=not_a_real_pdist,
                                      **extra_kwargs)
            stub_dm = DistanceMatrix([[0.0, 42.0], [42.0, 0.0]])
            self.assertEqual(observed, stub_dm)
예제 #6
0
    def test_alt_pairwise_func(self):
        # confirm that pairwise_func is actually being used
        def not_a_real_pdist(counts, metric):
            return [[0.0, 42.0], [42.0, 0.0]]

        tree_kwargs = dict(otu_ids=self.oids1, tree=self.tree1)
        # (metric, extra keyword arguments) in the order they are checked
        scenarios = [
            ('unweighted_unifrac', tree_kwargs),
            ('weighted_unifrac', tree_kwargs),
            (unweighted_unifrac, tree_kwargs),
            ("euclidean", dict()),
        ]
        for metric, kwargs in scenarios:
            dm = beta_diversity(metric, self.table1,
                                pairwise_func=not_a_real_pdist, **kwargs)
            self.assertEqual(dm, DistanceMatrix([[0.0, 42.0], [42.0, 0.0]]))
예제 #7
0
    def test_euclidean(self):
        # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
        # near-equality testing when that support is available
        dm_table1 = beta_diversity('euclidean', self.table1, self.sids1)
        self.assertEqual(dm_table1.shape, (3, 3))

        # diagonal first, then each pair in both directions
        for sample in ('A', 'B', 'C'):
            npt.assert_almost_equal(dm_table1[sample, sample], 0.0)
        symmetric_expectations = [
            (('A', 'B'), 2.23606798),
            (('A', 'C'), 4.12310563),
            (('B', 'C'), 2.82842712),
        ]
        for (left, right), value in symmetric_expectations:
            npt.assert_almost_equal(dm_table1[left, right], value)
            npt.assert_almost_equal(dm_table1[right, left], value)

        dm_table2 = beta_diversity('euclidean', self.table2, self.sids2)
        rows = [
            [0., 80.8455317, 84.0297566, 36.3042697, 86.0116271, 78.9176786],
            [80.8455317, 0., 71.0844568, 74.4714710, 69.3397433, 14.422205],
            [84.0297566, 71.0844568, 0., 77.2851861, 8.3066238, 60.7536007],
            [36.3042697, 74.4714710, 77.2851861, 0., 78.7908624, 70.7389567],
            [86.0116271, 69.3397433, 8.3066238, 78.7908624, 0., 58.4807660],
            [78.9176786, 14.422205, 60.7536007, 70.7389567, 58.4807660, 0.],
        ]
        wanted = DistanceMatrix(rows, self.sids2)
        for i_id in self.sids2:
            for j_id in self.sids2:
                npt.assert_almost_equal(dm_table2[i_id, j_id],
                                        wanted[i_id, j_id], 6)
예제 #8
0
    def test_braycurtis(self):
        # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
        # near-equality testing when that support is available
        small_dm = beta_diversity('braycurtis', self.table1, self.sids1)
        self.assertEqual(small_dm.shape, (3, 3))

        # every sample is at distance zero from itself
        for sid in ('A', 'B', 'C'):
            npt.assert_almost_equal(small_dm[sid, sid], 0.0)

        # off-diagonal entries are checked in both orientations
        pair_distances = (
            ('A', 'B', 0.27272727),
            ('A', 'C', 0.71428571),
            ('B', 'C', 0.66666667),
        )
        for first, second, distance in pair_distances:
            npt.assert_almost_equal(small_dm[first, second], distance)
            npt.assert_almost_equal(small_dm[second, first], distance)

        large_dm = beta_diversity('braycurtis', self.table2, self.sids2)
        reference_dm = DistanceMatrix([
            [0., 0.78787879, 0.86666667, 0.30927835, 0.85714286, 0.81521739],
            [0.78787879, 0., 0.78142077, 0.86813187, 0.75, 0.1627907],
            [0.86666667, 0.78142077, 0., 0.87709497, 0.09392265, 0.71597633],
            [0.30927835, 0.86813187, 0.87709497, 0., 0.87777778, 0.89285714],
            [0.85714286, 0.75, 0.09392265, 0.87777778, 0., 0.68235294],
            [0.81521739, 0.1627907, 0.71597633, 0.89285714, 0.68235294, 0.],
        ], self.sids2)
        for row_id in self.sids2:
            for col_id in self.sids2:
                npt.assert_almost_equal(large_dm[row_id, col_id],
                                        reference_dm[row_id, col_id], 6)
예제 #9
0
    def test_braycurtis(self):
        # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
        # near-equality testing when that support is available
        dm_table1 = beta_diversity('braycurtis', self.table1, self.sids1)
        self.assertEqual(dm_table1.shape, (3, 3))

        # diagonal first, then each pair in both directions
        for sample in ('A', 'B', 'C'):
            npt.assert_almost_equal(dm_table1[sample, sample], 0.0)
        symmetric_expectations = [
            (('A', 'B'), 0.27272727),
            (('A', 'C'), 0.71428571),
            (('B', 'C'), 0.66666667),
        ]
        for (left, right), value in symmetric_expectations:
            npt.assert_almost_equal(dm_table1[left, right], value)
            npt.assert_almost_equal(dm_table1[right, left], value)

        dm_table2 = beta_diversity('braycurtis', self.table2, self.sids2)
        rows = [
            [0., 0.78787879, 0.86666667, 0.30927835, 0.85714286, 0.81521739],
            [0.78787879, 0., 0.78142077, 0.86813187, 0.75, 0.1627907],
            [0.86666667, 0.78142077, 0., 0.87709497, 0.09392265, 0.71597633],
            [0.30927835, 0.86813187, 0.87709497, 0., 0.87777778, 0.89285714],
            [0.85714286, 0.75, 0.09392265, 0.87777778, 0., 0.68235294],
            [0.81521739, 0.1627907, 0.71597633, 0.89285714, 0.68235294, 0.],
        ]
        wanted = DistanceMatrix(rows, self.sids2)
        for i_id in self.sids2:
            for j_id in self.sids2:
                npt.assert_almost_equal(dm_table2[i_id, j_id],
                                        wanted[i_id, j_id], 6)
예제 #10
0
 def test_weighted_unifrac_normalized(self):
     # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
     # near-equality testing when that support is available
     # expected values calculated by hand
     shared_kwargs = {'otu_ids': self.oids1,
                      'tree': self.tree1,
                      'normalized': True}
     # the metric is selected once by name and once by function object
     by_name = beta_diversity('weighted_unifrac', self.table1, self.sids1,
                              **shared_kwargs)
     by_function = beta_diversity(weighted_unifrac, self.table1, self.sids1,
                                  **shared_kwargs)
     self.assertEqual(by_name.shape, (3, 3))
     self.assertEqual(by_name, by_function)

     reference_dm = DistanceMatrix(
         [[0.0, 0.128834, 0.085714],
          [0.128834, 0.0, 0.2142857],
          [0.085714, 0.2142857, 0.0]],
         ids=self.sids1)
     for row_id in self.sids1:
         for col_id in self.sids1:
             npt.assert_almost_equal(by_name[row_id, col_id],
                                     reference_dm[row_id, col_id], 6)
예제 #11
0
 def test_qualitative_bug_issue_1549(self):
     # The jaccard result for count data is expected to equal the result
     # for the same data reduced to booleans (count > 0).
     rows = [[42, 0, 37, 99, 1],
             [12, 1, 22, 88, 0],
             [25, 3, 23, 86, 0],
             [0, 0, 87, 12, 0]]
     mat = np.array(rows)
     boolean_mat = mat > 0
     observed_counts = beta_diversity('jaccard', mat)
     observed_boolean = beta_diversity('jaccard', boolean_mat)
     self.assertEqual(observed_counts, observed_boolean)
예제 #12
0
    def test_scipy_kwargs(self):
        # confirm that p can be passed to SciPy's minkowski, and that it
        # gives a different result than not passing it (the off-diagonal
        # entries are not equal).
        without_p = beta_diversity('minkowski', self.table1, self.sids1)
        with_p = beta_diversity('minkowski', self.table1, self.sids1, p=42.0)

        for row_id in self.sids1:
            for col_id in self.sids1:
                if row_id == col_id:
                    # diagonal entries are not compared
                    continue
                self.assertNotEqual(without_p[row_id, col_id],
                                    with_p[row_id, col_id])
예제 #13
0
    def test_scipy_kwargs(self):
        # confirm that p can be passed to SciPy's minkowski, and that it
        # gives a different result than not passing it (the off-diagonal
        # entries are not equal).
        default_dm = beta_diversity('minkowski', self.table1, self.sids1)
        p42_dm = beta_diversity('minkowski', self.table1, self.sids1, p=42.0)

        for first in self.sids1:
            # only pairs of distinct samples are compared
            others = [second for second in self.sids1 if first != second]
            for second in others:
                self.assertNotEqual(default_dm[first, second],
                                    p42_dm[first, second])
예제 #14
0
 def __dapply__(self, experiment):
     """Compute a distance matrix for ``experiment`` as a DataFrame.

     ``experiment.data_df`` is transposed and its values are passed to
     ``beta_diversity`` as ``counts``; the index of the untransposed frame
     is passed as ``otu_ids``. ``self.distance_metric`` selects the metric
     and ``self.kwargs`` is forwarded unchanged.

     Returns
     -------
     pandas.DataFrame
         ``dm.data`` with the transposed frame's index on both axes.

     Raises
     ------
     TypeError
         Re-raised unchanged unless its message contains
         'takes no keyword arguments'.
     """
     otu_ids = experiment.data_df.index
     df = experiment.data_df.transpose()
     # DataFrame.as_matrix() was removed in pandas 1.0; ``.values`` yields
     # the same ndarray on both old and current pandas releases.
     counts = df.values
     try:
         dm = beta_diversity(self.distance_metric, counts=counts,
                             otu_ids=otu_ids, **self.kwargs)
     except TypeError as e:
         if 'takes no keyword arguments' not in str(e):
             # bare raise keeps the original traceback intact
             raise
         # Retry without otu_ids -- presumably the metric callable rejects
         # the extra keyword (TODO confirm against the metrics in use).
         dm = beta_diversity(self.distance_metric, counts=counts,
                             **self.kwargs)

     return pd.DataFrame(dm.data, index=df.index, columns=df.index)
예제 #15
0
    def test_empty(self):
        # array of empty vectors: both metrics are expected to report a
        # distance of zero between the two samples
        sample_ids = ['a', 'b']
        all_zero = [[0.0, 0.0], [0.0, 0.0]]

        observed = beta_diversity('euclidean',
                                  np.array([[], []], dtype=np.int64),
                                  ids=sample_ids)
        npt.assert_array_equal(observed,
                               DistanceMatrix(all_zero, sample_ids))

        observed = beta_diversity('unweighted_unifrac',
                                  np.array([[], []], dtype=np.int64),
                                  ids=sample_ids, tree=self.tree1,
                                  otu_ids=[])
        self.assertEqual(observed, DistanceMatrix(all_zero, sample_ids))
예제 #16
0
    def test_empty(self):
        # array of empty vectors
        labels = ['a', 'b']
        zero_rows = [[0.0, 0.0], [0.0, 0.0]]

        # non-phylogenetic metric
        euclidean_dm = beta_diversity('euclidean',
                                      np.array([[], []], dtype=np.int64),
                                      ids=labels)
        expected_euclidean = DistanceMatrix(zero_rows, labels)
        npt.assert_array_equal(euclidean_dm, expected_euclidean)

        # phylogenetic metric with an empty otu_ids list
        unifrac_dm = beta_diversity('unweighted_unifrac',
                                    np.array([[], []], dtype=np.int64),
                                    ids=labels, tree=self.tree1, otu_ids=[])
        expected_unifrac = DistanceMatrix(zero_rows, labels)
        self.assertEqual(unifrac_dm, expected_unifrac)
예제 #17
0
파일: __init__.py 프로젝트: alecos/q2d2
def compute_distance_matrices(
               biom,
               tree=None,
               metrics=('weighted_unifrac', 'unweighted_unifrac',
                        'braycurtis', 'jaccard')):
    """Compute one beta-diversity result per requested metric.

    Parameters
    ----------
    biom
        Table-like object exposing ``.T``, ``.columns`` and ``.index``.
        Its transpose is passed as ``counts`` and its columns as ``ids``.
    tree
        Forwarded as ``tree`` for the two UniFrac metrics only.
    metrics
        Iterable of metric names. The default is an immutable tuple so the
        default value cannot be shared and mutated between calls.

    Returns
    -------
    dict
        Maps each metric name to the value returned by ``beta_diversity``.
        Empty when ``metrics`` is empty.
    """
    dms = {}
    for metric in metrics:
        # only the UniFrac metrics receive the index and the tree
        extra = {}
        if metric in ('unweighted_unifrac', 'weighted_unifrac'):
            extra = {'otu_ids': biom.index, 'tree': tree}
        dms[metric] = beta_diversity(metric, counts=np.asarray(biom.T),
                                     ids=biom.columns, **extra)
    return dms
예제 #18
0
 def compute_beta(self, metric="unweighted_unifrac"):
     # Metrics whose name contains "unifrac" are delegated to the private
     # UniFrac helper; all others go straight to beta_diversity and the
     # resulting data is wrapped in a DataFrame.
     if "unifrac" in metric:
         return self.__beta_unifrac(metric)
     distances = beta_diversity(metric, self.otu_df, self.sample_ids)
     return pd.DataFrame(distances.data)
예제 #19
0
def diversity(df_sv_list):
    """Compute richness, Shannon and Bray-Curtis results with skbio.

    For every frame in ``df_sv_list`` the first column is skipped and the
    remaining columns are treated as samples. Richness ("observed_otus")
    and Shannon values are left-merged, one column per input frame, into
    frames indexed by the module-level ``allsamples``; each new column is
    named after the matching entry of ``df_sv_list_names``.

    Returns
    -------
    tuple
        ``(richness, shannon, bc_dm_list)`` where ``bc_dm_list`` holds one
        Bray-Curtis DataFrame per input frame.
    """
    richness = pd.DataFrame(index=allsamples)
    shannon = pd.DataFrame(index=allsamples)
    bc_dm_list = []
    for position, sv_df in enumerate(df_sv_list):
        # columns are the SVs and rows are the samples
        counts = sv_df.iloc[:, 1:].T.values
        # ids should have the same order as the data rows
        sample_ids = sv_df.columns[1:]

        # richness
        observed = pd.DataFrame(
            alpha_diversity("observed_otus", counts, sample_ids))
        richness = richness.merge(observed, how="left",
                                  left_index=True, right_index=True)
        richness.rename(columns={0: df_sv_list_names[position]},
                        inplace=True)

        # shannon
        entropy = pd.DataFrame(
            alpha_diversity("shannon", counts, sample_ids))
        shannon = shannon.merge(entropy, how="left",
                                left_index=True, right_index=True)
        shannon.rename(columns={0: df_sv_list_names[position]},
                       inplace=True)

        # bray-curtis distance matrix, copied into a labelled frame
        distances = beta_diversity("braycurtis", counts, sample_ids)
        labelled = pd.DataFrame(index=distances.ids, columns=distances.ids)
        labelled.iloc[:, :] = distances.data
        bc_dm_list.append(labelled)
    return richness, shannon, bc_dm_list
예제 #20
0
    def beta_diversity_heatmap(self, working_samples, samples_list, tax_level):
        """Draw and save a clustermap of Bray-Curtis distances.

        Nothing happens when ``self.abundance_df.groupAbsoluteSamples()``
        returns None. Otherwise the columns named in ``samples_list`` are
        cast to int, Bray-Curtis distances between them are computed, a
        seaborn clustermap is drawn and handed to
        ``self.save_high_resolution_figure``, and all matplotlib figures
        are closed. ``working_samples`` and ``tax_level`` are not used in
        this body.
        """
        from skbio.diversity import beta_diversity
        import seaborn as sns

        if self.abundance_df.groupAbsoluteSamples() is None:
            return

        grouped = self.abundance_df.groupAbsoluteSamples()
        selected = grouped[samples_list].astype('int')
        ids = list(selected.columns)
        # one row of counts per sample
        sample_rows = selected.transpose().values.tolist()

        bc_dm = beta_diversity("braycurtis", sample_rows, ids)
        distance_frame = pd.DataFrame(bc_dm.data, index=ids, columns=ids)

        g = sns.clustermap(distance_frame,
                           metric='braycurtis',
                           annot_kws={"size": 8})

        self.save_high_resolution_figure(
            g,
            'Select file to save the beta diversity heatmap',
            'beta_diversity_heatmap',
            defaultextension='.png')

        import matplotlib.pyplot as plt
        plt.close("all")
예제 #21
0
    def test_weighted_unifrac_partial_full(self):
        # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
        # near-equality testing when that support is available
        # expected values calculated by hand
        every_pair = [('A', 'B'), ('A', 'C'), ('B', 'C')]
        partial_dm = partial_beta_diversity('weighted_unifrac',
                                            self.table1,
                                            self.sids1,
                                            otu_ids=self.oids1,
                                            tree=self.tree1,
                                            id_pairs=every_pair)
        full_dm = beta_diversity('weighted_unifrac',
                                 self.table1,
                                 self.sids1,
                                 otu_ids=self.oids1,
                                 tree=self.tree1)

        # the partial result covering every pair must match the full one
        self.assertEqual(partial_dm.shape, (3, 3))
        self.assertEqual(partial_dm, full_dm)

        reference_dm = DistanceMatrix(
            [[0.0, 0.1750000, 0.12499999],
             [0.1750000, 0.0, 0.3000000],
             [0.12499999, 0.3000000, 0.0]],
            ids=self.sids1)
        for row_id in self.sids1:
            for col_id in self.sids1:
                npt.assert_almost_equal(partial_dm[row_id, col_id],
                                        reference_dm[row_id, col_id], 6)
예제 #22
0
 def test_block_beta_diversity(self):
     # The blocked computation (k=2) must reproduce both the data and the
     # ids of the plain beta_diversity result.
     positional = ('unweighted_unifrac', self.table1, self.sids1)
     keyword = {'otu_ids': self.oids1, 'tree': self.tree1}
     expected = beta_diversity(*positional, **keyword)
     observed = block_beta_diversity(*positional, k=2, **keyword)
     npt.assert_equal(observed.data, expected.data)
     self.assertEqual(observed.ids, expected.ids)
예제 #23
0
def data_diversity(data):
    """Summarise Bray-Curtis distances between the rows of ``data``.

    Rows are labelled 1..n, distances are computed on ``abs(data)``, and
    the distance matrix is averaged column-wise.

    Returns
    -------
    tuple
        ``(t, avg_div_score)``: the per-column means of the distance
        matrix and the mean of those means.
    """
    n_rows = data.shape[0]
    row_labels = list(np.arange(1, n_rows + 1))
    distances = beta_diversity("braycurtis", np.absolute(data), row_labels)
    square = distances.to_data_frame().to_numpy()
    column_means = np.mean(square, axis=0)
    return column_means, np.mean(column_means)
예제 #24
0
 def test_unweighted_unifrac(self):
     # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
     # near-equality testing when that support is available
     # expected values calculated by hand
     tree_kwargs = {'otu_ids': self.oids1, 'tree': self.tree1}
     # the metric is selected once by name and once by function object
     by_name = beta_diversity('unweighted_unifrac', self.table1,
                              self.sids1, **tree_kwargs)
     by_function = beta_diversity(unweighted_unifrac, self.table1,
                                  self.sids1, **tree_kwargs)
     self.assertEqual(by_name.shape, (3, 3))
     self.assertEqual(by_name, by_function)

     reference_dm = DistanceMatrix(
         [[0.0, 0.0, 0.25], [0.0, 0.0, 0.25], [0.25, 0.25, 0.0]],
         ids=self.sids1)
     for row_id in self.sids1:
         for col_id in self.sids1:
             npt.assert_almost_equal(by_name[row_id, col_id],
                                     reference_dm[row_id, col_id], 6)
예제 #25
0
파일: __init__.py 프로젝트: alecos/q2d2
def compute_distance_matrices(biom,
                              tree=None,
                              metrics=(
                                  'weighted_unifrac', 'unweighted_unifrac',
                                  'braycurtis', 'jaccard'
                              )):
    """Return a dict mapping each metric name to its beta-diversity result.

    Parameters
    ----------
    biom
        Table-like object exposing ``.T``, ``.columns`` and ``.index``.
        ``np.asarray(biom.T)`` is used as ``counts`` and ``biom.columns``
        as ``ids``.
    tree
        Passed through as ``tree`` for 'weighted_unifrac' and
        'unweighted_unifrac'; ignored for every other metric.
    metrics
        Iterable of metric names to compute. The default is a tuple rather
        than a list, so the shared default object cannot be mutated.

    Returns
    -------
    dict
        One entry per metric; empty when ``metrics`` is empty.
    """
    phylogenetic = ('unweighted_unifrac', 'weighted_unifrac')
    dms = {}
    for metric in metrics:
        call_kwargs = {'counts': np.asarray(biom.T), 'ids': biom.columns}
        if metric in phylogenetic:
            # UniFrac additionally receives the table index and the tree
            call_kwargs['otu_ids'] = biom.index
            call_kwargs['tree'] = tree
        dms[metric] = beta_diversity(metric, **call_kwargs)
    return dms
예제 #26
0
    def __dapply__(self, experiment):
        """Return the distance matrix of ``experiment`` as a DataFrame.

        The transposed ``experiment.data_df`` supplies the counts, and the
        index of the untransposed frame is passed as ``otu_ids``.
        ``self.distance_metric`` and ``self.kwargs`` are forwarded to
        ``beta_diversity``.

        Returns
        -------
        pandas.DataFrame
            ``dm.data`` indexed on both axes by the transposed frame's
            index.

        Raises
        ------
        TypeError
            Propagated unchanged unless its message contains
            'takes no keyword arguments'.
        """
        otu_ids = experiment.data_df.index
        df = experiment.data_df.transpose()
        # DataFrame.as_matrix() no longer exists in pandas >= 1.0;
        # ``.values`` returns the same ndarray on every pandas version.
        counts = df.values
        try:
            dm = beta_diversity(self.distance_metric,
                                counts=counts,
                                otu_ids=otu_ids,
                                **self.kwargs)
        except TypeError as e:
            if 'takes no keyword arguments' not in str(e):
                # bare raise preserves the original traceback
                raise
            # Second attempt omits otu_ids -- presumably for metric
            # callables that reject that keyword (TODO confirm).
            dm = beta_diversity(self.distance_metric,
                                counts=counts,
                                **self.kwargs)

        distance_matrix_df = pd.DataFrame(dm.data,
                                          index=df.index,
                                          columns=df.index)
        return distance_matrix_df
예제 #27
0
 def test_block_beta_diversity(self):
     # block_beta_diversity with k=2 is expected to give the same data and
     # ids as beta_diversity on identical inputs.
     shared = dict(otu_ids=self.oids1, tree=self.tree1)
     reference = beta_diversity('unweighted_unifrac', self.table1,
                                self.sids1, **shared)
     blocked = block_beta_diversity('unweighted_unifrac', self.table1,
                                    self.sids1, k=2, **shared)
     npt.assert_equal(blocked.data, reference.data)
     self.assertEqual(blocked.ids, reference.ids)
예제 #28
0
def compute_beta_braycurtis(df):
    """Return Bray-Curtis distances between the columns of ``df``.

    Each column of ``df`` becomes one row of an integer count array and
    the column labels serve as ids. The result is a list of lists, one
    inner list per row of the distance matrix's DataFrame form.
    """
    from skbio.diversity import beta_diversity

    n_columns = len(df.columns)
    column_series = [df.iloc[:, pos] for pos in range(n_columns)]
    ids = [df.columns[pos] for pos in range(n_columns)]
    counts = np.array(column_series).astype(int)

    result = beta_diversity(metric="braycurtis", counts=counts, ids=ids)
    result_df = result.to_data_frame()
    return [list(result_df.iloc[row, :]) for row in range(len(result_df))]
예제 #29
0
def make_distance_matrix(biom_fp, method="braycurtis"):
    '''Load a table from ``biom_fp`` and compute a distance matrix.

    Returns a tuple ``(dm, s_md)``: the ``beta_diversity`` result for
    ``method`` and a DataFrame of per-id metadata (one row per id).
    '''
    table = load_table(biom_fp)

    # collect the metadata of every id into a frame
    metadata_by_id = {}
    for s_id in table.ids():
        metadata_by_id[s_id] = dict(table.metadata(s_id))
    s_md = pd.DataFrame.from_dict(metadata_by_id, orient='index')

    # scale and truncate to integers, assuming that the table contains
    # relative abundances (which cause beta_diversity to fail)
    scaled_rows = []
    for s_id in table.ids():
        scaled_rows.append([int(num * 100000) for num in table.data(s_id)])

    # beta diversity
    dm = beta_diversity(method, scaled_rows, table.ids())

    return dm, s_md
def get_dist(metric, mtx):
    """Return ``beta_diversity`` distances for ``mtx`` under ``metric``.

    'canberra', 'jaccard', 'kulczynski' and 'correlation' are passed to
    ``beta_diversity`` under the same name. Every other value -- including
    'bray_curtis', 'morisita_horn' and 'yue-clayton' -- is computed with
    "braycurtis".
    """
    # NOTE(review): 'morisita_horn' and 'yue-clayton' have no dedicated
    # implementation here and silently use Bray-Curtis. An R formulation
    # of Yue-Clayton recorded alongside this code was:
    #   stand <- decostand(data.matrix(biods), "total")
    #   dis <- designdist(stand, method="1-(J/(A+B-J))",
    #                     terms=c("quadratic"), abcd=FALSE)
    # See also:
    # http://stackoverflow.com/questions/38384329/how-to-get-skbio-pcoa-principal-coordinate-analysis-results
    for same_name in ("canberra", "jaccard", "kulczynski", "correlation"):
        if metric == same_name:
            return beta_diversity(same_name, mtx)

    # default, also used for the Bray-Curtis aliases and fallbacks
    return beta_diversity("braycurtis", mtx)
예제 #31
0
    def beta(self,
             table: biom.Table,
             metric: str,
             pseudocount: int = 1,
             n_jobs: int = 1):
        """Compute beta diversity for ``table`` using scikit-learn distances.

        The table's matrix data is densified and transposed before being
        passed as ``counts``, with the table's sample ids as ``ids``.
        Distances are computed by ``sklearn.metrics.pairwise_distances``
        with ``n_jobs`` forwarded. ``pseudocount`` is accepted but not
        used in this body.

        Raises
        ------
        ValueError
            If ``table.is_empty()`` is true.
        """
        dense_counts = table.matrix_data.toarray().T

        if table.is_empty():
            raise ValueError("The provided table object is empty")

        call_kwargs = {
            'metric': metric,
            'counts': dense_counts,
            'ids': table.ids(axis='sample'),
            'validate': True,
            'pairwise_func': sklearn.metrics.pairwise_distances,
            'n_jobs': n_jobs,
        }
        return beta_diversity(**call_kwargs)
예제 #32
0
    def compute_beta_diversity(self):
        """Compute and cache beta diversity values

        This method calculates a beta diversity distance matrix and saves it
        to a folder for re-use. The matrices are calculated based on the full
        dataset so that any subsample drawn from the full dataset can be
        fetched from these precomputed matrices.

        For every metric in ``self._beta_metrics`` the matrix is read from
        ``roc-curves/<name>/cached-matrices/<metric>.full.txt`` when that
        file exists; otherwise it is computed and written there. Results
        are stored in ``self._beta_diversity_matrices`` keyed by metric.

        See Also
        --------
        Sculptor.compute_alpha_diversity
        """
        dir_fp = 'roc-curves/%s/cached-matrices/' % self.name
        os.makedirs(dir_fp, exist_ok=True)

        # ``np.int`` was removed in NumPy 1.24; it was only an alias of
        # the builtin ``int``, so the resulting dtype is unchanged.
        X = self._original_bt.matrix_data.toarray().astype(int).T

        self._beta_diversity_matrices = {}

        for metric in self._beta_metrics:
            fp = os.path.join(dir_fp, metric + '.full.txt')

            if os.path.exists(fp):
                # reuse the cached matrix
                distance_matrix = DistanceMatrix.read(fp)
            else:
                # the UniFrac metrics also need the tree and observation ids
                if metric in {'unweighted_unifrac', 'weighted_unifrac'}:
                    kws = {
                        'tree': self.tree,
                        'otu_ids': self._original_bt.ids('observation')
                    }
                else:
                    kws = {}

                distance_matrix = beta_diversity(metric, X,
                                                 self._original_bt.ids(),
                                                 **kws)

                distance_matrix.write(fp)

            self._beta_diversity_matrices[metric] = distance_matrix
예제 #33
0
    def test_weighted_unifrac_partial_full(self):
        # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
        # near-equality testing when that support is available
        # expected values calculated by hand
        tree_kwargs = {'otu_ids': self.oids1, 'tree': self.tree1}
        requested_pairs = [('A', 'B'), ('A', 'C'), ('B', 'C')]
        from_partial = partial_beta_diversity('weighted_unifrac',
                                              self.table1, self.sids1,
                                              id_pairs=requested_pairs,
                                              **tree_kwargs)
        from_full = beta_diversity('weighted_unifrac', self.table1,
                                   self.sids1, **tree_kwargs)

        # requesting every pair must reproduce the full matrix
        self.assertEqual(from_partial.shape, (3, 3))
        self.assertEqual(from_partial, from_full)

        wanted = DistanceMatrix([[0.0, 0.1750000, 0.12499999],
                                 [0.1750000, 0.0, 0.3000000],
                                 [0.12499999, 0.3000000, 0.0]],
                                ids=self.sids1)
        for i_id in self.sids1:
            for j_id in self.sids1:
                npt.assert_almost_equal(from_partial[i_id, j_id],
                                        wanted[i_id, j_id], 6)
예제 #34
0
def get_pairwise_dist_mat(deblur_biom, dist_type):
    """Compute pairwise distances between samples of deblurred seqs

    Parameters
    ----------
    deblur_biom: biom.Table
        Table of sequences to compare sample by sample
    dist_type: str
        Name of the distance metric. Usually "jaccard" or "braycurtis".
        For "jaccard" the table is first converted with ``pa``.

    Returns
    -------
    The value returned by beta_diversity for the transposed table data
    (cast to int64 and densified), labelled with the sample ids
    """
    if dist_type == "jaccard":
        table = deblur_biom.pa(inplace=False)
    else:
        table = deblur_biom

    print("starting beta_diversity")
    dense_counts = table.transpose().matrix_data.astype("int64").todense()
    sample_ids = table.ids(axis="sample")
    dist_mat = beta_diversity(dist_type, dense_counts, ids=sample_ids)
    print("end beta_diversity")
    return dist_mat
예제 #35
0
#####################

# Alpha diversity: observed OTUs and Faith's PD for the same samples.
adiv_obs_otuss = alpha_diversity('observed_otus', datas, idss)

adiv_faith_pds = alpha_diversity('faith_pd',
                                 datas,
                                 ids=idss,
                                 otu_ids=dfs1.columns,
                                 tree=tree,
                                 validate=False)

#bc_dm = beta_diversity("braycurtis", data, ids, validate=False)

# Beta diversity: weighted UniFrac distances between the samples.
wu_dms = beta_diversity("weighted_unifrac",
                        datas,
                        idss,
                        tree=tree,
                        otu_ids=dfs1.columns,
                        validate=False)
print(wu_dms)
# The context manager guarantees beta.tsv is flushed and closed even if
# the write fails; a bare open() would leave the handle dangling.
with open("beta.tsv", "w") as o:
    o.write(str(wu_dms))

wu_pcs = pcoa(wu_dms)
print(wu_pcs)

# NOTE(review): eigen_input.tsv is not produced by this script; the
# commented-out commands below appear to be how it was generated.
#wu_pc.write("eigen.tsv", format='ordination')
#subprocess.call("sed -n '2p' eigen.tsv > eigen_input.tsv", shell=True)
#subprocess.call("sed -1 '1 i /PC1\tPC2\tPC3' eigen_input.tsv", shell=True)
eigen = pd.read_csv("eigen_input.tsv", sep="\t", header=None)
eigen.columns = ['PC1', 'PC2', 'PC3']
eigen = eigen.round(3)
예제 #36
0
def betadiv_clustering(TaXon_table_xlsx, height, width, threshold,
                       betadiv_linkage, taxonomic_level, path_to_outdirs,
                       template, font_size, diss_metric):
    """Cluster the samples of a TaXon table and export a dendrogram.

    The table is condensed to one row per taxon of ``taxonomic_level``
    (reads are summed, the "unidentified" group is dropped), pairwise
    sample dissimilarities are computed with scikit-bio and a plotly
    dendrogram is drawn from a scipy linkage of those dissimilarities.
    The figure is written as PDF and HTML and the square matrix as XLSX
    into ``<path_to_outdirs>/Beta_diversity/``.

    Parameters
    ----------
    TaXon_table_xlsx : path to the TaXon table Excel file; every column
        from the 11th onwards is treated as a sample.
    height, width : figure size, converted with ``int()``.
    threshold : colour threshold of the dendrogram, converted with
        ``float()``.
    betadiv_linkage : linkage method passed to ``scipy`` ``linkage``.
    taxonomic_level : column to aggregate on; "ASVs", "ESVs", "OTUs" and
        "zOTUs" are mapped to the "ID" column.
    path_to_outdirs : project output directory.
    template, font_size : forwarded to the plotly layout.
    diss_metric : metric name passed to ``beta_diversity``; also used in
        the axis title and output file names.

    Returns
    -------
    None. Results are written to disk, reported through GUI popups and
    recorded with ``ttt_log``.
    """

    from scipy.cluster.hierarchy import dendrogram, linkage
    import plotly.figure_factory as ff
    import numpy as np
    import pandas as pd
    from skbio.diversity import beta_diversity
    from pathlib import Path
    import PySimpleGUI as sg
    import webbrowser

    ## load the table; empty cells are labelled "unidentified"
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    taxon_df = pd.read_excel(TaXon_table_xlsx, header=0)
    taxon_df = taxon_df.fillna("unidentified")

    ## label used in the output file names; ASV-like levels keep their
    ## spelling and are read from the "ID" column
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"
    else:
        taxon_title = taxonomic_level.lower()

    ## sample columns start at the 11th column
    samples = taxon_df.columns.tolist()[10:]

    ## keep only the taxon column plus the samples and merge all rows that
    ## share a taxon: reads are summed, the taxon name is kept once
    taxon_df = taxon_df[[taxonomic_level] + samples]
    aggregation_functions = {sample: 'sum' for sample in samples}
    aggregation_functions[taxonomic_level] = 'first'
    condensed_df = taxon_df.groupby(
        taxon_df[taxonomic_level]).aggregate(aggregation_functions)
    if 'unidentified' in condensed_df.index:
        condensed_df = condensed_df.drop('unidentified')

    ## one row of reads per sample
    reads_per_sample = condensed_df[samples].transpose().values.tolist()
    ## pairwise dissimilarities for the chosen metric
    dissimilarity_dm = beta_diversity(diss_metric, reads_per_sample, samples)
    ## square form, labelled for the Excel export
    square_form = dissimilarity_dm.data
    matrix_df = pd.DataFrame(square_form, index=samples, columns=samples)
    ## condensed form, used as input for the linkage
    condensed_form = dissimilarity_dm.condensed_form()

    def _linkage_from_condensed(_ignored):
        ## the argument supplied by plotly is ignored on purpose: the
        ## linkage is always built from the precomputed dissimilarities
        return linkage(condensed_form, betadiv_linkage, metric=diss_metric)

    ## cluster dendrogram
    fig = ff.create_dendrogram(square_form,
                               labels=samples,
                               color_threshold=float(threshold),
                               orientation="left",
                               linkagefun=_linkage_from_condensed)
    fig.update_yaxes(ticks="")
    fig.update_xaxes(title="A")
    fig.update_layout(xaxis_title=str(diss_metric) + " distance",
                      height=int(height),
                      width=int(width),
                      template=template,
                      font_size=font_size,
                      title_font_size=font_size)

    ## the three output files share everything but the extension
    output_stem = (str(path_to_outdirs) + "/" + "Beta_diversity" + "/" +
                   TaXon_table_xlsx.stem + "_" + taxon_title +
                   "_dendrogram_" + diss_metric)
    output_pdf = Path(output_stem + ".pdf")
    output_html = Path(output_stem + ".html")
    output_xlsx = Path(output_stem + ".xlsx")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))
    matrix_df.to_excel(output_xlsx)

    ## ask to show plot
    if sg.PopupYesNo('Show plot?', keep_on_top=True) == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## tell the user where the results are, then write to the log file
    sg.Popup(diss_metric + " clustering dendrograms are found in",
             path_to_outdirs,
             "/Beta_diversity/",
             title="Finished",
             keep_on_top=True)
    from taxontabletools.create_log import ttt_log
    ttt_log(diss_metric + " clustering", "analysis", TaXon_table_xlsx.name,
            output_pdf.name, "", path_to_outdirs)
예제 #37
0
def beta_diversity(TaXon_table_xlsx, width, heigth, cmap, meta_data_to_test,
                   taxonomic_level, path_to_outdirs, template, font_size,
                   diss_metric):
    """Plot a sample-by-sample dissimilarity heatmap with an ANOSIM summary.

    Reads the TaXon table and its metadata table
    (``<path_to_outdirs>/Meta_data_table/<stem>_metadata.xlsx``), drops
    samples whose ``meta_data_to_test`` entry is empty, condenses the
    reads to ``taxonomic_level``, computes pairwise dissimilarities with
    scikit-bio and runs ANOSIM (999 permutations) on the metadata groups.
    The heatmap is written as PDF and HTML and the matrix as XLSX into
    ``<path_to_outdirs>/Beta_diversity/``.

    Parameters
    ----------
    TaXon_table_xlsx : path to the TaXon table Excel file; every column
        from the 11th onwards is treated as a sample.
    width, heigth : figure size, converted with ``int()``.
    cmap : continuous colour scale for the heatmap.
    meta_data_to_test : metadata column used for grouping.
    taxonomic_level : column to aggregate on; "ASVs", "ESVs", "OTUs" and
        "zOTUs" are mapped to the "ID" column.
    path_to_outdirs : project output directory.
    template, font_size : forwarded to the plotly layout.
    diss_metric : metric name passed to scikit-bio's ``beta_diversity``.

    Raises
    ------
    RuntimeError
        If the metadata value is different for every sample, or identical
        for all samples (a popup is shown first).

    Returns
    -------
    None. If the sample names of the two tables do not match, only an
    error popup is shown.
    """

    import pandas as pd
    import numpy as np
    from skbio.diversity import beta_diversity
    from skbio.stats.distance import anosim
    import plotly.express as px
    from pathlib import Path
    import PySimpleGUI as sg
    import webbrowser

    ## load the TaXon table and the metadata table that belongs to it
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    Meta_data_table_xlsx = Path("/".join([
        str(path_to_outdirs), "Meta_data_table",
        TaXon_table_xlsx.stem + "_metadata.xlsx"
    ]))
    taxon_df = pd.read_excel(TaXon_table_xlsx, header=0)
    taxon_df = taxon_df.fillna("unidentified")
    taxon_samples = taxon_df.columns.tolist()[10:]
    meta_df = pd.read_excel(Meta_data_table_xlsx, header=0).fillna("nan")
    meta_samples = meta_df['Samples'].tolist()

    metadata_list = meta_df[meta_data_to_test].values.tolist()
    metadata_loc = meta_df.columns.tolist().index(meta_data_to_test)

    ## samples without a metadata entry ("nan" = empty cell); the first
    ## metadata column is taken as the sample name
    drop_samples = [
        row[0] for row in meta_df.values.tolist()
        if row[metadata_loc] == "nan"
    ]

    if drop_samples:
        ## remove those samples from the TaXon table
        taxon_df = taxon_df.drop(drop_samples, axis=1)
        taxon_samples = taxon_df.columns.tolist()[10:]
        ## also remove OTUs that are left with zero reads only
        non_empty_rows = [
            row for row in taxon_df.values.tolist()
            if set(row[10:]) != {0}
        ]
        taxon_df = pd.DataFrame(non_empty_rows,
                                columns=taxon_df.columns.tolist())
        ## and remove the samples from the metadata table
        kept_meta_rows = [
            row for row in meta_df.values.tolist()
            if row[0] not in drop_samples
        ]
        meta_df = pd.DataFrame(kept_meta_rows,
                               columns=meta_df.columns.tolist())
        meta_samples = meta_df['Samples'].tolist()

    metadata_list = meta_df[meta_data_to_test].values.tolist()

    ## label for the plot title and file names; ASV-like levels are read
    ## from the "ID" column
    taxon_title = taxonomic_level
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxonomic_level = "ID"

    ## the grouping is rejected if every sample has its own value ...
    n_groups = len(set(meta_df[meta_data_to_test]))
    if n_groups == len(meta_df['Samples'].tolist()):
        sg.Popup(
            "The meta data is unique for all samples. Please adjust the meta data table!",
            title=("Error"))
        raise RuntimeError

    ## ... or if all samples share one single value
    if n_groups == 1:
        sg.Popup(
            "The meta data is similar for all samples. Please adjust the meta data table!",
            title=("Error"))
        raise RuntimeError

    ## both tables must list exactly the same samples
    if sorted(taxon_samples) != sorted(meta_samples):
        sg.PopupError(
            "Error: The samples between the taxon table and meta table do not match!",
            keep_on_top=True)
        return

    ## samples are used in metadata-table order so they line up with
    ## metadata_list
    samples = meta_samples

    ## keep only the taxon column plus the samples and merge all rows that
    ## share a taxon: reads are summed, the taxon name is kept once
    taxon_df = taxon_df[[taxonomic_level] + samples]
    aggregation_functions = {sample: 'sum' for sample in samples}
    aggregation_functions[taxonomic_level] = 'first'
    condensed_df = taxon_df.groupby(
        taxon_df[taxonomic_level]).aggregate(aggregation_functions)
    if 'unidentified' in condensed_df.index:
        condensed_df = condensed_df.drop('unidentified')

    ## one row of reads per sample
    reads_per_sample = condensed_df[samples].transpose().values.tolist()
    ## pairwise dissimilarities for the chosen metric
    dissimilarity_dm = beta_diversity(diss_metric, reads_per_sample, samples)

    ## ANOSIM on the metadata groups; the result goes into the plot title
    anosim_results = anosim(dissimilarity_dm, metadata_list, permutations=999)
    anosim_r = round(anosim_results['test statistic'], 5)
    anosim_p = anosim_results['p-value']
    textbox = "".join([
        "Anosim (", meta_data_to_test, ", ", taxon_title, ")<br>",
        "R = ", str(anosim_r), "<br>", "p = ", str(anosim_p)
    ])

    ## labelled square matrix for the Excel export
    matrix = dissimilarity_dm.data
    matrix_df = pd.DataFrame(matrix, index=samples, columns=samples)

    # create plot
    fig = px.imshow(matrix,
                    x=samples,
                    y=samples,
                    color_continuous_scale=cmap,
                    labels={"color": diss_metric + " distance"})
    fig.update_layout(height=int(heigth),
                      width=int(width),
                      template=template,
                      showlegend=True,
                      title=textbox,
                      font_size=font_size,
                      title_font_size=font_size)

    ## the three output files share everything but the extension
    output_stem = (str(path_to_outdirs) + "/" + "Beta_diversity" + "/" +
                   TaXon_table_xlsx.stem + "_" + meta_data_to_test + "_" +
                   taxon_title + "_" + diss_metric)
    output_pdf = Path(output_stem + ".pdf")
    output_html = Path(output_stem + ".html")
    output_xlsx = Path(output_stem + ".xlsx")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))
    matrix_df.to_excel(output_xlsx)

    ## ask to show plot
    if sg.PopupYesNo('Show plot?', keep_on_top=True) == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## tell the user where the results are, then write to the log file
    sg.Popup("Beta diversity estimate are found in",
             path_to_outdirs,
             "/Beta_diversity/",
             title="Finished",
             keep_on_top=True)
    from taxontabletools.create_log import ttt_log
    ttt_log("beta diversity", "analysis", TaXon_table_xlsx.name,
            output_pdf.name, meta_data_to_test, path_to_outdirs)
def __main__():
    """Command-line entry point: beta diversity from a tabular abundance file.

    Parses the options below, reads one count column per sample from the
    delimited input file (rows are OTUs), calls ``beta_diversity`` with the
    chosen metric and writes the resulting matrix to the output file.

    Options
    -------
    -v/--version      print the version to stderr and exit
    -i/--input        abundance file name
    --otu_column      1-based column holding the OTU ids (optional)
    --sample_columns  comma separated 1-based sample columns; all columns
                      except the OTU column when unset
    --header          the first line of the file holds the sample names
    --distance_metric metric name passed to ``beta_diversity``
    --tree            Newick tree file, required for metrics in NEEDS_TREE
    -o/--output       output file name

    Data lines with too few fields are reported on stderr and skipped.
    """
    parser = optparse.OptionParser( usage="%prog [options]" )
    parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
    parser.add_option( '-i', '--input', dest='input', action='store', type="string", default=None, help='Input abundance Filename' )
    parser.add_option( '', '--otu_column', dest='otu_column', action='store', type="int", default=None, help='OTU ID Column (1 based)' )
    parser.add_option( '', '--sample_columns', dest='sample_columns', action='store', type="string", default=None, help='Comma separated list of sample columns, unset to use all.' )
    parser.add_option( '', '--header', dest='header', action='store_true', default=False, help='Abundance file has a header line' )
    parser.add_option( '', '--distance_metric', dest='distance_metric', action='store', type="string", default=None, help='Distance metric to use' )
    parser.add_option( '', '--tree', dest='tree', action='store', type="string", default=None, help='Newick Tree Filename' )
    parser.add_option( '-o', '--output', dest='output', action='store', type="string", default=None, help='Output Filename' )
    (options, args) = parser.parse_args()
    if options.version:
        # sys.stderr.write instead of the Python-2-only ``print >>`` statement;
        # the emitted text is the same.
        sys.stderr.write( "scikit-bio betadiversity from tabular file %s\n" % ( __VERSION__, ) )
        sys.exit()

    # Convert the 1-based OTU column to a 0-based index.
    if options.otu_column is not None:
        otu_column = options.otu_column - 1
    else:
        otu_column = None

    if options.sample_columns is None:
        # No explicit selection: use every column of the first line except
        # the OTU column.
        # NOTE(review): the file is opened in binary mode; under Python 3
        # that yields bytes, so DELIMITER would have to be bytes as well --
        # confirm before running this on Python 3.
        with open( options.input, 'rb' ) as fh:
            line = fh.readline()
            # list(...) so that .remove() also works where range() is lazy
            columns = list( range( len( line.split( DELIMITER ) ) ) )
            if otu_column in columns:
                columns.remove( otu_column )
    else:
        columns = [ int( x ) - 1 for x in options.sample_columns.split( "," ) ]

    # Highest column index a data line must provide. The OTU column only
    # counts when one was given (avoids comparing None with integers).
    if otu_column is None:
        max_col = max( columns )
    else:
        max_col = max( columns + [otu_column] )
    counts = [ [] for x in columns ]
    sample_names = []
    otu_names = []
    with open( options.input, 'rb' ) as fh:
        if options.header:
            header = fh.readline().rstrip('\n\r').split( DELIMITER )
            sample_names = [ header[i] for i in columns ]
        else:
            sample_names = [ "SAMPLE_%i" % x for x in range( len( columns ) ) ]
        for i, line in enumerate( fh ):
            fields = line.rstrip('\n\r').split( DELIMITER )
            if len(fields) <= max_col:
                # Fixed: this used to reference the non-existent
                # ``sys.stederr`` and crashed instead of reporting the line.
                sys.stderr.write( "Bad data line:  %s\n" % ( fields, ) )
                continue
            if otu_column is not None:
                otu_names.append( fields[ otu_column ] )
            else:
                otu_names.append( "OTU_%i" % i )
            for j, col in enumerate( columns ):
                counts[ j ].append( int( fields[ col ] ) )

    # Some metrics need extra inputs: the OTU ids and/or a phylogenetic tree.
    extra_kwds = {}
    if options.distance_metric in NEEDS_OTU_NAMES:
        extra_kwds['otu_ids'] = otu_names
    if options.distance_metric in NEEDS_TREE:
        # NOTE(review): assert statements are stripped under ``python -O``.
        assert options.tree, Exception( "You must provide a newick tree when using '%s'" % options.distance_metric )
        # NB: TreeNode apparently needs unicode files
        with codecs.open( options.tree, 'rb', 'utf-8' ) as fh:
            extra_kwds['tree'] = TreeNode.read( fh )

    bd_dm = beta_diversity( options.distance_metric, counts, ids=sample_names, **extra_kwds )
    bd_dm.write( options.output )
예제 #39
0
        lambda x: pd.Series(
            subsample_counts(x.astype("int"), depth), index=counts.columns
        ),
        axis=1,
    )
    return rare


# Load the genus-level table; the sample ids are kept as strings.
log.info("Reading genus-level data.")
genera = pd.read_csv(
    path.join("..", "data", "american_gut_genus.csv"),
    dtype={"id": str},
)
# Total count per sample id.
libsize = genera.groupby("id")["count"].sum()

# Sample-by-genus count matrix (missing combinations filled with 0),
# passed through rarefy_counts with a depth argument of 1000.
mat = rarefy_counts(
    pd.pivot_table(
        genera,
        index="id",
        columns="Genus",
        values="count",
        aggfunc="sum",
        fill_value=0,
    ),
    1000,
)

# "braycurtis" dissimilarities between samples, reduced to two PCoA axes.
log.info("Calculating beta diversity and PCoA.")
D = beta_diversity("braycurtis", mat.values, mat.index, validate=True)
red = pcoa(D, number_of_dimensions=2)

# Store the per-sample coordinates.
log.info("Saving results to `pcoa.csv`.")
red.samples.to_csv("pcoa.csv")
예제 #40
0
    def analyse(self, user_request, base, headers, sample_labels,
                metadata_vals, phylogenetic_tree):
        """Run an NMDS ordination of the samples and return plot data.

        The distance type is read from the request's custom attribute
        "type". The two UniFrac types need a phylogenetic tree and an
        integer matrix, "euclidean" uses ``euclidean_distances`` and every
        other type is passed to ``beta_diversity``. A metric MDS provides
        the start configuration for the non-metric MDS.

        Returns a dict with the key "nmds" (one entry per sample holding
        the label "s", the metadata value "m" and the coordinates "nmds1"
        and "nmds2") plus the axis limits "nmds1Max", "nmds1Min",
        "nmds2Max" and "nmds2Min". For the UniFrac types, returns
        ``{"no_tree": True}`` when no tree is available and
        ``{"has_float": True}`` when the project matrix type is "float".
        """
        logger.info("Starting NMDS analysis")
        dist_type = user_request.get_custom_attr("type")

        if dist_type in ("weighted_unifrac", "unweighted_unifrac"):
            if phylogenetic_tree == "":
                return {"no_tree": True}

            project_map = Map(user_request.user_id, user_request.pid)
            if project_map.matrix_type == "float":
                return {"has_float": True}

            base = base.astype(int)
            tree = TreeNode.read(StringIO(phylogenetic_tree))
            # More than two children at the root is treated as "not yet
            # rooted"; such a tree is re-rooted at its midpoint
            if len(tree.root().children) > 2:
                tree = tree.root_at_midpoint()
            dist_matrix = beta_diversity(dist_type,
                                         base,
                                         ids=sample_labels,
                                         otu_ids=headers,
                                         tree=tree)
        elif dist_type == "euclidean":
            dist_matrix = euclidean_distances(base)
        else:
            base = base.astype(int)
            dist_matrix = beta_diversity(dist_type, base)

        # Copy the square matrix into a plain list of lists
        size = dist_matrix.shape[0]
        similarities = [[dist_matrix[row][col] for col in range(size)]
                        for row in range(size)]

        # Use traditional MDS to determine the initial position
        mds = manifold.MDS(n_components=2,
                           max_iter=3000,
                           eps=1e-9,
                           dissimilarity="precomputed",
                           n_jobs=1)
        pos = mds.fit(similarities).embedding_
        # Use NMDS to adjust the original positions to optimize for stress
        nmds = manifold.MDS(n_components=2,
                            metric=False,
                            dissimilarity="precomputed",
                            max_iter=3000,
                            eps=1e-12)
        npos = nmds.fit_transform(similarities, init=pos)

        # One record per sample; the metadata value stays "" when no
        # metadata was supplied
        ret_table = []
        for idx, coords in enumerate(npos):
            meta = ""
            if metadata_vals and len(metadata_vals) > 0:
                meta = metadata_vals[idx]
            ret_table.append({
                "s": sample_labels[idx],
                "m": meta,
                "nmds1": coords[0],
                "nmds2": coords[1],
            })

        logger.info("After NMDS plotting")

        # Scale the extreme coordinates by this factor for the axis limits
        padding = 1.5
        return {
            "nmds": ret_table,
            "nmds1Max": np.max(npos[:, 0]) * padding,
            "nmds1Min": np.min(npos[:, 0]) * padding,
            "nmds2Max": np.max(npos[:, 1]) * padding,
            "nmds2Min": np.min(npos[:, 1]) * padding
        }
예제 #41
0
파일: UniPCoA.py 프로젝트: AdeBC/UniPCoA
	otu_ids = X.columns.tolist()
	X = X.reset_index().melt(id_vars=['index'], value_vars=X.columns, var_name='taxonomy', value_name='abundance')
	taxa = pd.DataFrame(X.taxonomy.apply(lambda x: dict(map(lambda y: y.split('__'), filter(lambda x: not x.endswith('__'), x.split(';'))))).tolist())
	X = pd.concat([X.drop(columns=['taxonomy']), taxa], axis=1)
	X = X.melt(id_vars=['index','abundance'], value_vars=taxa.columns, var_name='rank', value_name='taxonomy')
	X = X.groupby(by=['index', 'taxonomy'], as_index=False).sum().pivot_table(columns='taxonomy', index='index', values='abundance')
	if use_phylogeny:
		X = X.loc[:, X.columns.to_series().isin(names)]
	ids = X.index.tolist()
	otu_ids = X.columns.tolist()
	
	try:
		print('Trying calculating {} beta_diversity using scikit-bio & scikit-learn package...'.format(args.metric))
		print('This could be time-consuming.')
		if use_phylogeny:
			mat = beta_diversity(args.metric, X, ids, tree=tree, otu_ids=otu_ids, validate=False).data
		else:
			mat = beta_diversity(args.metric, X, ids, otu_ids=otu_ids, validate=False).data
	except ValueError:
		print('Failed, the metric you selected is not supported by neither scikit-bio nor scikit-learn.')
		print('Trying using SciPy...')
		mat = squareform(pdist(X, metric=args.metric))
	print('Succeeded!')	
	pcs = pd.DataFrame(pcoa(mat, number_of_dimensions=2).samples.values.tolist(), index=X.index, columns=['PC1', 'PC2'])
	pcs = pd.concat([pcs, Y], axis=1)
	print('Visualizing the data using plotnine package...')
	p = (ggplot(pcs, aes(x='PC1', y='PC2', color='Env'))
			+ geom_point(size=0.2)
			+ scale_color_manual(['#E64B35FF','#4DBBD5FF','#00A087FF','#3C5488FF','#F39B7FFF','#8491B4FF','#91D1C2FF'])
			+ theme(panel_grid_major = element_blank(), panel_grid_minor = element_blank(), panel_background = element_blank())
			+ theme(axis_line = element_line(color="gray", size = 1))
예제 #42
0
def PCoA_analysis(TaXon_table_xlsx, meta_data_to_test, taxonomic_level, width,
                  height, pcoa_s, path_to_outdirs, template, font_size,
                  color_discrete_sequence, pcoa_dissimilarity):
    import pandas as pd
    import numpy as np
    from skbio.diversity import beta_diversity
    from skbio.stats.ordination import pcoa
    from skbio.stats.distance import anosim
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    import plotly.express as px
    from pathlib import Path
    import PySimpleGUI as sg
    import os, webbrowser
    from itertools import combinations

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    Meta_data_table_xlsx = Path(
        str(path_to_outdirs) + "/" + "Meta_data_table" + "/" +
        TaXon_table_xlsx.stem + "_metadata.xlsx")
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx,
                                   header=0).fillna("unidentified")
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx,
                                       header=0).fillna("nan")
    Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()
    metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test)

    ## drop samples with metadata called nan (= empty)
    drop_samples = [
        i[0] for i in Meta_data_table_df.values.tolist()
        if i[metadata_loc] == "nan"
    ]

    if drop_samples != []:
        ## filter the TaXon table
        TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1)
        TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
        ## also remove empty OTUs
        row_filter_list = []
        for row in TaXon_table_df.values.tolist():
            reads = set(row[10:])
            if reads != {0}:
                row_filter_list.append(row)
        columns = TaXon_table_df.columns.tolist()
        TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns)
        Meta_data_table_df = pd.DataFrame(
            [
                i for i in Meta_data_table_df.values.tolist()
                if i[0] not in drop_samples
            ],
            columns=Meta_data_table_df.columns.tolist())
        Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    ## create a y axis title text
    taxon_title = taxonomic_level.lower()

    ## adjust taxonomic level if neccessary
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    # check if the meta data differs
    if len(set(Meta_data_table_df[meta_data_to_test])) == len(
            Meta_data_table_df['Samples'].tolist()):
        sg.Popup(
            "The meta data is unique for all samples. Please adjust the meta data table!",
            title=("Error"))
        raise RuntimeError

    # check if the meta data differs
    if len(set(Meta_data_table_df[meta_data_to_test])) == 1:
        sg.Popup(
            "The meta data is similar for all samples. Please adjust the meta data table!",
            title=("Error"))
        raise RuntimeError

    if sorted(TaXon_table_samples) == sorted(Meta_data_table_samples):

        samples = Meta_data_table_samples

        ## extract the relevant data
        TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]
        ## define an aggregation function to combine multiple hit of one taxonimic level
        aggregation_functions = {}
        ## define samples functions
        for sample in samples:
            ## 'sum' will calculate the sum of p/a data
            aggregation_functions[sample] = 'sum'
        ## define taxon level function
        aggregation_functions[taxonomic_level] = 'first'
        ## create condensed dataframe
        TaXon_table_df = TaXon_table_df.groupby(
            TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)
        if 'unidentified' in TaXon_table_df.index:
            TaXon_table_df = TaXon_table_df.drop('unidentified')

        data = TaXon_table_df[samples].transpose().values.tolist()
        jc_dm = beta_diversity(pcoa_dissimilarity, data, samples)
        ordination_result = pcoa(jc_dm)
        metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()

        anosim_results = anosim(jc_dm, metadata_list, permutations=999)
        anosim_r = round(anosim_results['test statistic'], 5)
        anosim_p = anosim_results['p-value']
        textbox = meta_data_to_test + ", " + taxon_title + "<br>Anosim " + "R = " + str(
            anosim_r) + " " + "p = " + str(anosim_p)

        #######################################################################################
        # create window to ask for PCoA axis to test
        def slices(list, slice):
            for i in range(0, len(list), slice):
                yield list[i:i + slice]

        # collect the PCoA proportion explained values
        proportion_explained_list = []
        for i, pcoa_axis in enumerate(ordination_result.proportion_explained):
            if round(pcoa_axis * 100, 2) >= 1:
                proportion_explained_list.append("PC" + str(i + 1) + " (" +
                                                 str(round(pcoa_axis *
                                                           100, 2)) + " %)")

        pcoa_axis_checkboxes = list(
            slices([
                sg.Checkbox(name, key=name, size=(15, 1))
                for name in proportion_explained_list
            ], 10))

        pcoa_window_layout = [
            [sg.Text('Check up to four axes to be displayed')],
            [sg.Frame(layout=pcoa_axis_checkboxes, title='')],
            [sg.Text('Only axes >= 1 % explained variance are shown')],
            [sg.CB("Connect categories", default=True, key="draw_mesh")],
            [sg.Text('')],
            [sg.Button('Plot', key='Plot')],
            [sg.Button('Back')],
        ]

        pcoa_window = sg.Window('PCoA axis',
                                pcoa_window_layout,
                                keep_on_top=True)

        while True:
            event, values = pcoa_window.read()

            draw_mesh = values["draw_mesh"]

            if event is None or event == 'Back':
                break

            if event == 'Plot':

                ## create a subfolder for better sorting and overview
                dirName = Path(
                    str(path_to_outdirs) + "/" + "PCoA_plots" + "/" +
                    TaXon_table_xlsx.stem + "/")
                if not os.path.exists(dirName):
                    os.mkdir(dirName)

                # collect the pcoa axis values
                axis_to_plot = [
                    key for key, value in values.items()
                    if value == True and "PC" in key
                ]
                # pass on only if two pcoa axes were checked
                if len(axis_to_plot) == 2:
                    cat1 = axis_to_plot[1].split()[0]
                    cat2 = axis_to_plot[0].split()[0]

                    df_pcoa = ordination_result.samples[[cat1, cat2]]
                    df_pcoa.insert(
                        2, "Metadata",
                        Meta_data_table_df[meta_data_to_test].values.tolist(),
                        True)
                    df_pcoa.insert(
                        3, "Samples",
                        Meta_data_table_df["Samples"].values.tolist(), True)

                    if draw_mesh == True:
                        combinations_list = []
                        for metadata in df_pcoa["Metadata"]:
                            ## collect all entries for the respective metadata
                            arr = df_pcoa.loc[df_pcoa['Metadata'] == metadata][
                                [cat1, cat2, "Metadata",
                                 "Samples"]].to_numpy()
                            ## create a df for all possible combinations using itertools combinations
                            for entry in list(combinations(arr, 2)):
                                combinations_list.append(list(entry[0]))
                                combinations_list.append(list(entry[1]))
                        ## create a dataframe to draw the plot from
                        df = pd.DataFrame(combinations_list)
                        df.columns = [cat1, cat2, "Metadata", "Samples"]

                        fig = px.scatter(
                            df,
                            x=cat1,
                            y=cat2,
                            color="Metadata",
                            text="Samples",
                            title=textbox,
                            color_discrete_sequence=color_discrete_sequence)
                        fig.update_traces(marker_size=int(pcoa_s),
                                          mode="markers+lines")
                        fig.update_layout(height=int(height),
                                          width=int(width),
                                          template=template,
                                          showlegend=True,
                                          font_size=font_size,
                                          title_font_size=font_size)
                        fig.update_xaxes(title=axis_to_plot[1])
                        fig.update_yaxes(title=axis_to_plot[0])

                    else:
                        fig = px.scatter(
                            df_pcoa,
                            x=cat1,
                            y=cat2,
                            color="Metadata",
                            text="Samples",
                            title=textbox,
                            color_discrete_sequence=color_discrete_sequence)
                        fig.update_traces(marker_size=int(pcoa_s),
                                          mode="markers")
                        fig.update_layout(height=int(height),
                                          width=int(width),
                                          template=template,
                                          showlegend=True,
                                          font_size=font_size,
                                          title_font_size=font_size)
                        fig.update_xaxes(title=axis_to_plot[1])
                        fig.update_yaxes(title=axis_to_plot[0])

                    ## define output files
                    output_pdf = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + ".pdf")
                    output_html = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + ".html")
                    output_xlsx = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + ".xlsx")

                    ## write files
                    fig.write_image(str(output_pdf))
                    fig.write_html(str(output_html))
                    ordination_result.samples[[cat1,
                                               cat2]].to_excel(output_xlsx)

                    ## ask to show file
                    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
                    if answer == "Yes":
                        webbrowser.open('file://' + str(output_html))

                    ## print closing text
                    closing_text = "\n" + "PCoA plots are found in: " + str(
                        path_to_outdirs) + "/PCoA_plots/"
                    sg.Popup(closing_text, title="Finished", keep_on_top=True)

                    ## write to log
                    from taxontabletools.create_log import ttt_log
                    ttt_log("pcoa analysis", "analysis", TaXon_table_xlsx.name,
                            output_pdf.name, meta_data_to_test,
                            path_to_outdirs)
                    break

                elif len(axis_to_plot) == 3:
                    cat1 = axis_to_plot[0].split()[0]
                    cat2 = axis_to_plot[1].split()[0]
                    cat3 = axis_to_plot[2].split()[0]

                    df_pcoa = ordination_result.samples[[cat1, cat2, cat3]]
                    df_pcoa.insert(
                        3, "Metadata",
                        Meta_data_table_df[meta_data_to_test].values.tolist(),
                        True)
                    df_pcoa.insert(
                        4, "Samples",
                        Meta_data_table_df["Samples"].values.tolist(), True)

                    ## check if lines are to be drawn between the dots
                    if draw_mesh == True:
                        combinations_list = []
                        for metadata in df_pcoa["Metadata"]:
                            ## collect all entries for the respective metadata
                            arr = df_pcoa.loc[df_pcoa['Metadata'] == metadata][
                                [cat1, cat2, cat3, "Metadata",
                                 "Samples"]].to_numpy()
                            ## create a df for all possible combinations using itertools combinations
                            for entry in list(combinations(arr, 2)):
                                combinations_list.append(list(entry[0]))
                                combinations_list.append(list(entry[1]))
                        ## create a dataframe to draw the plot from
                        df = pd.DataFrame(combinations_list)
                        df.columns = [cat1, cat2, cat3, "Metadata", "Samples"]
                        ## draw the plot
                        fig = px.scatter_3d(
                            df,
                            x=cat1,
                            y=cat2,
                            z=cat3,
                            color="Metadata",
                            text="Samples",
                            title=textbox,
                            color_discrete_sequence=color_discrete_sequence)
                        fig.update_traces(marker_size=int(pcoa_s),
                                          mode="markers+lines",
                                          line=dict(width=0.5))
                        fig.update_layout(height=int(height),
                                          width=int(width),
                                          template=template,
                                          title=textbox,
                                          showlegend=True,
                                          font_size=font_size,
                                          title_font_size=font_size)
                        fig.update_layout(
                            scene=dict(xaxis_title=axis_to_plot[0],
                                       yaxis_title=axis_to_plot[1],
                                       zaxis_title=axis_to_plot[2]))
                    else:
                        fig = px.scatter_3d(
                            df_pcoa,
                            x=cat1,
                            y=cat2,
                            z=cat3,
                            color="Metadata",
                            text="Samples",
                            color_discrete_sequence=color_discrete_sequence)
                        fig.update_traces(marker_size=int(pcoa_s),
                                          mode="markers")
                        fig.update_layout(height=int(height),
                                          width=int(width),
                                          template=template,
                                          showlegend=True,
                                          title=textbox,
                                          font_size=font_size,
                                          title_font_size=font_size)
                        fig.update_layout(
                            scene=dict(xaxis_title=axis_to_plot[0],
                                       yaxis_title=axis_to_plot[1],
                                       zaxis_title=axis_to_plot[2]))

                    ## define output files
                    output_pdf = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + "_3d.pdf")
                    output_html = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + "_3d.html")
                    output_xlsx = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + "_3d.xlsx")

                    ## write output files
                    fig.write_image(str(output_pdf))
                    fig.write_html(str(output_html))
                    ordination_result.samples[[cat1,
                                               cat2]].to_excel(output_xlsx)

                    ## ask to show file
                    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
                    if answer == "Yes":
                        webbrowser.open('file://' + str(output_html))

                    ## print closing text
                    closing_text = "PCoA plots are found in: " + str(
                        path_to_outdirs) + "/PCoA_plots/"
                    sg.Popup(closing_text, title="Finished", keep_on_top=True)

                    ## write log file
                    from taxontabletools.create_log import ttt_log
                    ttt_log("pcoa analysis", "analysis", TaXon_table_xlsx.name,
                            output_pdf.name, meta_data_to_test,
                            path_to_outdirs)
                    break

                else:
                    sg.Popup("Please choose not more than 3 PCoA axes",
                             title="Error",
                             keep_on_top=True)

            if event == 'Plot matrix':
                if len(proportion_explained_list) >= 4:

                    ## create a subfolder for better sorting and overview
                    dirName = Path(
                        str(path_to_outdirs) + "/" + "PCoA_plots" + "/" +
                        TaXon_table_xlsx.stem + "/")
                    if not os.path.exists(dirName):
                        os.mkdir(dirName)

                    df_pcoa = ordination_result.samples[[
                        "PC1", "PC2", "PC3", "PC4"
                    ]]
                    df_pcoa.insert(
                        4, "Metadata",
                        Meta_data_table_df[meta_data_to_test].values.tolist(),
                        True)
                    df_pcoa.insert(
                        5, "Sample",
                        Meta_data_table_df["Samples"].values.tolist(), True)

                    fig = make_subplots(rows=4, cols=4)
                    ########### 1 ###########
                    fig.add_trace(go.Scatter(), row=1, col=1)
                    fig.update_layout(template=template,
                                      font_size=font_size,
                                      title_font_size=font_size)
                    text = "PC1 (" + str(
                        round(
                            ordination_result.proportion_explained["PC1"] *
                            100, 2)) + " %)"
                    fig.add_annotation(text=text, showarrow=False)
                    fig.update_xaxes(showticklabels=False, showgrid=False)
                    fig.update_yaxes(showticklabels=False, showgrid=False)
                    ########### 2 ###########
                    df = df_pcoa[["PC1", "PC2", "Metadata", "Sample"]]
                    for metadata in set(metadata_list):
                        df_metadata = df[df['Metadata'] == metadata]
                        #fig = px.scatter(df_pcoa, x="PC1", y="PC2", , )
                        fig.add_trace(go.Scatter(
                            x=df_metadata["PC1"].values.tolist(),
                            y=df_metadata["PC2"].values.tolist(),
                            mode='markers',
                            name=metadata,
                            text=df_metadata["Sample"].values.tolist()),
                                      row=1,
                                      col=2)
                    ########### 3 ###########
                    df = df_pcoa[["PC1", "PC3", "Metadata", "Sample"]]
                    for metadata in set(metadata_list):
                        df_metadata = df[df['Metadata'] == metadata]
                        #fig = px.scatter(df_pcoa, x="PC1", y="PC2", , )
                        fig.add_trace(go.Scatter(
                            x=df_metadata["PC1"].values.tolist(),
                            y=df_metadata["PC3"].values.tolist(),
                            mode='markers',
                            name=metadata,
                            showlegend=False,
                            text=df_metadata["Sample"].values.tolist()),
                                      row=1,
                                      col=3)
                    ########### 4 ###########
                    df = df_pcoa[["PC1", "PC4", "Metadata", "Sample"]]
                    for metadata in set(metadata_list):
                        df_metadata = df[df['Metadata'] == metadata]
                        fig.add_trace(go.Scatter(
                            x=df_metadata["PC1"].values.tolist(),
                            y=df_metadata["PC4"].values.tolist(),
                            mode='markers',
                            name=metadata,
                            showlegend=False,
                            text=df_metadata["Sample"].values.tolist()),
                                      row=1,
                                      col=4)
                        fig.update_traces(marker_size=int(pcoa_s),
                                          mode="markers")
                        fig.update_xaxes(showgrid=False, row=1, col=4)
                        fig.update_yaxes(showgrid=False, row=1, col=4)
                    ########### 5 ###########
                    fig.add_trace(go.Scatter(), row=2, col=2)
                    fig.update_layout(template=template,
                                      font_size=font_size,
                                      title_font_size=font_size)
                    text = "PC2 (" + str(
                        round(
                            ordination_result.proportion_explained["PC2"] *
                            100, 2)) + " %)"
                    fig.add_annotation(text=text,
                                       showarrow=False,
                                       row=2,
                                       col=2)
                    ########### 6 ###########
                    df = df_pcoa[["PC2", "PC3", "Metadata", "Sample"]]
                    for metadata in set(metadata_list):
                        df_metadata = df[df['Metadata'] == metadata]
                        #fig = px.scatter(df_pcoa, x="PC1", y="PC2", , )
                        fig.add_trace(go.Scatter(
                            x=df_metadata["PC2"].values.tolist(),
                            y=df_metadata["PC3"].values.tolist(),
                            mode='markers',
                            name=metadata,
                            showlegend=False,
                            text=df_metadata["Sample"].values.tolist()),
                                      row=2,
                                      col=3)
                    ########### 7 ###########
                    df = df_pcoa[["PC2", "PC4", "Metadata", "Sample"]]
                    for metadata in set(metadata_list):
                        df_metadata = df[df['Metadata'] == metadata]
                        fig.add_trace(go.Scatter(
                            x=df_metadata["PC2"].values.tolist(),
                            y=df_metadata["PC4"].values.tolist(),
                            mode='markers',
                            name=metadata,
                            showlegend=False,
                            text=df_metadata["Sample"].values.tolist()),
                                      row=2,
                                      col=4)
                    ########### 8 ###########
                    fig.add_trace(go.Scatter(), row=3, col=3)
                    fig.update_layout(template=template,
                                      font_size=font_size,
                                      title_font_size=font_size)
                    text = "PC3 (" + str(
                        round(
                            ordination_result.proportion_explained["PC3"] *
                            100, 2)) + " %)"
                    fig.add_annotation(text=text,
                                       showarrow=False,
                                       row=3,
                                       col=3)
                    ########### 9 ###########
                    df = df_pcoa[["PC3", "PC4", "Metadata", "Sample"]]
                    for metadata in set(metadata_list):
                        df_metadata = df[df['Metadata'] == metadata]
                        #fig = px.scatter(df_pcoa, x="PC1", y="PC2", , )
                        fig.add_trace(go.Scatter(
                            x=df_metadata["PC3"].values.tolist(),
                            y=df_metadata["PC4"].values.tolist(),
                            mode='markers',
                            name=metadata,
                            showlegend=False,
                            text=df_metadata["Sample"].values.tolist()),
                                      row=3,
                                      col=4)
                    ########### 10 ###########
                    fig.add_trace(go.Scatter(), row=4, col=4)
                    fig.update_layout(template=template,
                                      font_size=font_size,
                                      title_font_size=font_size)
                    text = "PC4 (" + str(
                        round(
                            ordination_result.proportion_explained["PC4"] *
                            100, 2)) + " %)"
                    fig.add_annotation(text=text,
                                       showarrow=False,
                                       row=4,
                                       col=4)

                    ######################
                    fig.update_xaxes(showline=True,
                                     mirror=True,
                                     linewidth=1,
                                     linecolor='black')
                    fig.update_yaxes(showline=True,
                                     mirror=True,
                                     linewidth=1,
                                     linecolor='black')
                    fig.update_traces(marker_size=int(pcoa_s), mode="markers")
                    # finish plot matrix
                    fig.update_layout(height=1000,
                                      width=1000,
                                      title_text=textbox)

                    ## define output files
                    output_pdf = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + "_matrix.pdf")
                    output_html = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + "_matrix.html")

                    ## write output files
                    fig.write_image(str(output_pdf))
                    fig.write_html(str(output_html))

                    ## ask to show file
                    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
                    if answer == "Yes":
                        webbrowser.open('file://' + str(output_html))

                    ## print closing text
                    closing_text = "\n" + "PCoA plots are found in: " + str(
                        path_to_outdirs) + "/PCoA_plots/"
                    sg.Popup(closing_text, title="Finished", keep_on_top=True)

                    ## write to log file
                    from taxontabletools.create_log import ttt_log
                    ttt_log("pcoa analysis", "analysis", TaXon_table_xlsx.name,
                            output_pdf.name, meta_data_to_test,
                            path_to_outdirs)
                    break
                else:
                    sg.Popup(
                        "There must be at least 4 PCoA axis available to plot the matrix!"
                    )

        pcoa_window.close()

    else:
        sg.PopupError(
            "The sample of both the TaXon table and the metadata table have to match!"
        )
예제 #43
0
    def analyse_other(self, user_request, otu_table, headers, sample_labels,
                      metaVals, phylogenetic_tree):
        """Compute a PCoA from a beta-diversity matrix and package the result.

        The distance metric name is read from the request's ``type`` custom
        attribute. For ``weighted_unifrac`` / ``unweighted_unifrac`` the
        Newick string in ``phylogenetic_tree`` is parsed and passed along;
        when that string is empty, ``{"no_tree": True}`` is returned instead.

        The returned dict holds:
          * ``pca``     -- one dict per sample with keys ``s`` (sample label),
                           ``m`` (metadata value or ``""``) and ``pca1`` ..
                           ``pca3`` (coordinates rounded to 8 decimals) for
                           the axes named by the ``pca1``..``pca3`` request
                           attributes;
          * ``pcaVar``  -- proportion explained as percentages, limited to
                           the first 12 axes;
          * ``pcaNMax`` / ``pcaNMin`` for N in 1..3 -- running extremes,
                           seeded with 0 and 1000000 respectively.
        """
        metric = user_request.get_custom_attr("type")

        otu_table = otu_table.astype(int)

        if metric in ("weighted_unifrac", "unweighted_unifrac"):
            if phylogenetic_tree == "":
                return {"no_tree": True}
            tree = TreeNode.read(StringIO(phylogenetic_tree))
            # More than two children at the root: re-root at the midpoint
            # before handing the tree to the UniFrac metrics.
            if len(tree.root().children) > 2:
                tree = tree.root_at_midpoint()
            dist_matrix = beta_diversity(metric,
                                         otu_table,
                                         ids=sample_labels,
                                         otu_ids=headers,
                                         tree=tree)
        else:
            dist_matrix = beta_diversity(metric, otu_table)

        ordination = pcoa(dist_matrix)
        coordinates = ordination.samples
        explained = ordination.proportion_explained

        logger.info("After running the R PCA")

        axis_keys = ("pca1", "pca2", "pca3")
        # Seeds are kept as-is: maxima start at 0, minima at 1000000.
        lowest = dict.fromkeys(axis_keys, 1000000)
        highest = dict.fromkeys(axis_keys, 0)

        # Requested axis identifiers; each is appended to "PC" to form the
        # column name looked up in the ordination coordinates.
        axis_ids = {
            key: user_request.get_custom_attr(key) for key in axis_keys
        }

        pcaRow = []
        for row_index in range(len(coordinates)):
            # Metadata is only attached when there is one value per sample.
            meta = ""
            if metaVals and len(metaVals) == len(coordinates):
                meta = metaVals[row_index]

            point = {"s": sample_labels[row_index], "m": meta}
            sample_coords = coordinates.iloc[row_index]
            for key in axis_keys:
                point[key] = round(sample_coords["PC" + axis_ids[key]], 8)

            for key in axis_keys:
                value = point[key]
                if value > highest[key]:
                    highest[key] = value
                if value < lowest[key]:
                    lowest[key] = value

            pcaRow.append(point)

        pcaVarRow = []
        for position, proportion in enumerate(explained):
            pcaVarRow.append(float(proportion) * 100)
            # Stop after the entry at position 11, i.e. 12 values at most.
            if position > 10:
                break

        abundancesObj = {"pca": pcaRow, "pcaVar": pcaVarRow}
        for key in axis_keys:
            abundancesObj[key + "Max"] = highest[key]
            abundancesObj[key + "Min"] = lowest[key]
        return abundancesObj
예제 #44
0
    def test_invalid_input(self):
        """Verify that beta_diversity raises on malformed input.

        Covers a mismatched id count, an unknown metric name, input with
        more than two dimensions, negative counts, and unexpected keyword
        arguments (for a string metric, both UniFrac metric names and a
        metric passed as a callable).
        """
        # number of ids doesn't match the number of samples
        # (arguments follow the (metric, counts, ids) order used by every
        # other call here; table1 produces a 3x3 matrix in test_euclidean,
        # so two ids are too few)
        error_msg = "Number of rows"
        with self.assertRaisesRegex(ValueError, error_msg):
            beta_diversity('euclidean', self.table1, list('AB'))

        # unknown metric provided
        error_msg = "not-a-metric"
        with self.assertRaisesRegex(ValueError, error_msg):
            beta_diversity('not-a-metric', self.table1)

        # 3-D list provided as input
        error_msg = "Only 1-D and 2-D"
        with self.assertRaisesRegex(ValueError, error_msg):
            beta_diversity('euclidean', [[[43]]])

        # negative counts
        error_msg = "negative values."
        with self.assertRaisesRegex(ValueError, error_msg):
            beta_diversity('euclidean', [[0, 1, 3, 4], [0, 3, -12, 42]])
        with self.assertRaisesRegex(ValueError, error_msg):
            beta_diversity('euclidean', [[0, 1, 3, -4], [0, 3, 12, 42]])

        # additional kwargs
        error_msg = "'not_a_real_kwarg'"
        with self.assertRaisesRegex(TypeError, error_msg):
            beta_diversity('euclidean', [[0, 1, 3], [0, 3, 12]],
                           not_a_real_kwarg=42.0)
        with self.assertRaisesRegex(TypeError, error_msg):
            beta_diversity('unweighted_unifrac', [[0, 1, 3], [0, 3, 12]],
                           not_a_real_kwarg=42.0, tree=self.tree1,
                           otu_ids=['O1', 'O2', 'O3'])
        with self.assertRaisesRegex(TypeError, error_msg):
            beta_diversity('weighted_unifrac', [[0, 1, 3], [0, 3, 12]],
                           not_a_real_kwarg=42.0, tree=self.tree1,
                           otu_ids=['O1', 'O2', 'O3'])
        # same check with the metric given as a callable rather than a name
        with self.assertRaisesRegex(TypeError, error_msg):
            beta_diversity(weighted_unifrac, [[0, 1, 3], [0, 3, 12]],
                           not_a_real_kwarg=42.0, tree=self.tree1,
                           otu_ids=['O1', 'O2', 'O3'])