def test_subsample_neighbors_locale_w_seed(self):
    """Check that locale-aware sampling is repeatable with a fixed seed.

    The first seeded call is sanity-checked and then used as the reference
    that every later call with the same seed must reproduce.
    """
    def run_seeded():
        # Identical arguments on every call, including ``seed=0``.
        return subsample_neighbors(
            self.focal_seqs1, self.context_seqs1, percent_id=0.98,
            samples_per_cluster=2,
            locale=self.context_md1.get_column('x'), seed=0)

    # Because the seed is fixed, whatever the first call returns is the
    # result expected from every subsequent call.
    reference = run_seeded()
    self.assertTrue(reference.inclusion['c1'])
    self.assertEqual(reference.inclusion.sum(), 3)
    self.assertEqual(reference.metadata, self.context_md1)

    for _ in range(self._N_TEST_ITERATIONS):
        repeat = run_seeded()
        pdt.assert_series_equal(repeat.inclusion, reference.inclusion)
def test_subsample_neighbors_metadata_subset(self):
    """Check the error raised for the 'missing-id' metadata fixture.

    Loads ``context-metadata-1-missing-id.tsv`` and expects a
    ``ValueError`` whose message matches 'not present in the metadata'
    when its ``x`` column is used as ``locale``.
    """
    md_path = self.get_data_path('context-metadata-1-missing-id.tsv')
    incomplete_md = qiime2.Metadata.load(md_path)

    expect_error = self.assertRaisesRegex(
        ValueError, 'not present in the metadata')
    with expect_error:
        subsample_neighbors(
            self.focal_seqs1, self.context_seqs1, percent_id=0.98,
            samples_per_cluster=1,
            locale=incomplete_md.get_column('x'))
def test_subsample_neighbors_no_locale_alt_samples_per_cluster(self):
    """Check the selection made without ``locale`` at three per cluster.

    Expects c1, c2, c3 and c5 to be included, c4 and c6 excluded, and the
    returned metadata to hold only the six ids (no columns).
    """
    sel = subsample_neighbors(
        self.focal_seqs1, self.context_seqs1, percent_id=0.98,
        samples_per_cluster=3)

    context_ids = ['c1', 'c2', 'c3', 'c4', 'c5', 'c6']
    included = [True, True, True, False, True, False]
    exp_inclusion = pd.Series(included, index=context_ids,
                              name='inclusion')
    # Column-less metadata: just the ids, with the index named 'id'.
    exp_metadata = qiime2.Metadata(
        pd.DataFrame(index=pd.Index(context_ids, name='id')))

    pdt.assert_series_equal(sel.inclusion, exp_inclusion)
    self.assertEqual(sel.metadata, exp_metadata)
    self.assertEqual(sel.label, 'subsample_neighbors')
def test_subsample_neighbors_terminal_gaps_ignored(self):
    """Check the selection for the second fixture pair at 100% identity.

    With ``percent_id=1.0`` the result is expected to contain exactly one
    id, c1, marked as included, plus column-less metadata for that id.
    """
    sel = subsample_neighbors(
        self.focal_seqs2, self.context_seqs2, percent_id=1.0,
        samples_per_cluster=2)

    only_id = ['c1']
    exp_inclusion = pd.Series([True], index=only_id, name='inclusion')
    # Column-less metadata: just the id, with the index named 'id'.
    exp_metadata = qiime2.Metadata(
        pd.DataFrame(index=pd.Index(only_id, name='id')))

    pdt.assert_series_equal(sel.inclusion, exp_inclusion)
    self.assertEqual(sel.metadata, exp_metadata)
    self.assertEqual(sel.label, 'subsample_neighbors')
def test_subsample_neighbors_metadata_superset(self):
    """Check that the 'extra-ids' metadata fixture is accepted.

    Uses the ``x`` column of ``context-metadata-2-extra-ids.tsv`` as
    ``locale`` and expects the returned metadata to equal that metadata
    filtered down to c1, the single included id.
    """
    md_path = self.get_data_path('context-metadata-2-extra-ids.tsv')
    superset_md = qiime2.Metadata.load(md_path)

    sel = subsample_neighbors(
        self.focal_seqs2, self.context_seqs2, percent_id=1.0,
        samples_per_cluster=2, locale=superset_md.get_column('x'))

    kept_ids = ['c1']
    exp_inclusion = pd.Series([True], index=kept_ids, name='inclusion')
    exp_metadata = superset_md.filter_ids(kept_ids)

    pdt.assert_series_equal(sel.inclusion, exp_inclusion)
    self.assertEqual(sel.metadata, exp_metadata)
    self.assertEqual(sel.label, 'subsample_neighbors')
def test_subsample_neighbors_locale(self):
    """Check that unseeded, locale-aware sampling favours c4.

    No seed is passed, so the call is repeated ``_N_TEST_ITERATIONS``
    times. Every run must include c1, select three of the six context
    sequences, and return the expected metadata and label. Across all
    runs, c4 must be selected more often than each of c2, c3 and c5.
    """
    exp_metadata = self.context_md1.to_dataframe()
    exp_metadata.index.name = 'id'
    exp_metadata = qiime2.Metadata(exp_metadata)

    # Number of runs in which each candidate id was selected.
    times_sampled = dict.fromkeys(('c2', 'c3', 'c4', 'c5'), 0)

    for _ in range(self._N_TEST_ITERATIONS):
        sel = subsample_neighbors(
            self.focal_seqs1, self.context_seqs1, percent_id=0.98,
            samples_per_cluster=2,
            locale=self.context_md1.get_column('x'))
        # Ids whose inclusion flag is True in this run.
        sampled_ids = set(sel.inclusion[sel.inclusion].index)

        self.assertIn('c1', sampled_ids)
        self.assertEqual(sel.inclusion.sum(), 3)
        self.assertEqual(len(sel.inclusion), 6)
        self.assertEqual(sel.metadata, exp_metadata)
        self.assertEqual(sel.label, 'subsample_neighbors')

        for seq_id in times_sampled:
            if seq_id in sampled_ids:
                times_sampled[seq_id] += 1

    # c2, c3 and c5 all have locale "def" while c4 has locale "hijk", so
    # we expect to see c4 more frequently than any of the other three.
    for seq_id in ('c2', 'c3', 'c5'):
        self.assertGreater(times_sampled['c4'], times_sampled[seq_id])
def test_subsample_neighbors_invalid_max_accepts(self):
    """Check that ``samples_per_cluster=11`` is rejected.

    Expects a ``ValueError`` whose message matches 'obtained per cluster'.
    """
    expect_error = self.assertRaisesRegex(
        ValueError, 'obtained per cluster')
    with expect_error:
        subsample_neighbors(
            self.focal_seqs1, self.context_seqs1, percent_id=0.98,
            samples_per_cluster=11)