def test_leading_trailing_whitespace_column_name(self):
    # Two columns built from identical values and IDs, differing only in the
    # whitespace around the column name, are expected to compare equal.
    def build_column(column_name):
        values = pd.Series(
            ['foo', ' bar ', 'baz'],
            name=column_name,
            index=pd.Index(['a', 'b', 'c'], name='id'),
        )
        return CategoricalMetadataColumn(values)

    padded_name = build_column(' col ')
    plain_name = build_column('col')

    self.assertEqual(padded_name, plain_name)
def test_all_missing_data(self):
    # A column holding nothing but missing values should come back from
    # to_series() unchanged and still carry the object dtype.
    def all_nan_series():
        # Fresh objects on every call so input and expectation never alias.
        return pd.Series(
            np.array([np.nan, np.nan, np.nan], dtype=object),
            name='col1',
            index=pd.Index(['a', 'b', 'c'], name='id'),
        )

    column = CategoricalMetadataColumn(all_nan_series())
    observed = column.to_series()
    expected = all_nan_series()

    pdt.assert_series_equal(observed, expected)
    self.assertEqual(observed.dtype, object)
def test_all_missing_data(self):
    # An all-NaN, object-dtype series is given to the column; to_series()
    # is expected to hand back an equal series with dtype object.
    sample_ids = ['a', 'b', 'c']
    missing = [np.nan, np.nan, np.nan]

    source = pd.Series(
        np.array(missing, dtype=object),
        name='col1',
        index=pd.Index(sample_ids, name='id'),
    )
    observed = CategoricalMetadataColumn(source).to_series()

    # Built independently of `source` so the comparison is not an identity
    # check against the input object.
    expected = pd.Series(
        np.array(missing, dtype=object),
        name='col1',
        index=pd.Index(sample_ids, name='id'),
    )

    pdt.assert_series_equal(observed, expected)
    self.assertEqual(observed.dtype, object)
def test_supported_dtype(self):
    # An object-dtype series of strings plus a missing value is accepted;
    # the same series doubles as the expected to_series() result.
    sample_ids = pd.Index(['a', 'b', 'c', 'd'], name='id')
    series = pd.Series(
        ['foo', np.nan, 'bar', 'foo'],
        name='my column',
        index=sample_ids,
    )

    column = CategoricalMetadataColumn(series)

    self.assertEqual(column.id_count, 4)
    self.assertEqual(column.id_header, 'id')
    self.assertEqual(column.ids, ('a', 'b', 'c', 'd'))
    self.assertEqual(column.name, 'my column')

    round_tripped = column.to_series()
    pdt.assert_series_equal(round_tripped, series)
    self.assertEqual(round_tripped.dtype, object)
def test_numeric_strings_preserved_as_strings(self):
    # Strings that look like numbers are expected to come back from
    # to_series() equal to the input, in an object-dtype series.
    sample_ids = pd.Index(['a', 'b', 'c', 'd'], name='id')
    series = pd.Series(
        ['1', np.nan, '2.5', '3.0'],
        name='my column',
        index=sample_ids,
    )

    column = CategoricalMetadataColumn(series)

    self.assertEqual(column.id_count, 4)
    self.assertEqual(column.id_header, 'id')
    self.assertEqual(column.ids, ('a', 'b', 'c', 'd'))
    self.assertEqual(column.name, 'my column')

    round_tripped = column.to_series()
    pdt.assert_series_equal(round_tripped, series)
    self.assertEqual(round_tripped.dtype, object)
def test_numeric_strings_preserved_as_strings(self):
    # Numeric-looking strings go in; the column's basic properties and its
    # to_series() output are checked against the original input.
    series = pd.Series(
        ['1', np.nan, '2.5', '3.0'],
        name='my column',
        index=pd.Index(['a', 'b', 'c', 'd'], name='id'),
    )
    column = CategoricalMetadataColumn(series)

    # Checked in the same order as the individual assertions they replace.
    expected_properties = (
        ('id_count', 4),
        ('id_header', 'id'),
        ('ids', ('a', 'b', 'c', 'd')),
        ('name', 'my column'),
    )
    for attribute, expected_value in expected_properties:
        self.assertEqual(getattr(column, attribute), expected_value)

    observed = column.to_series()
    pdt.assert_series_equal(observed, series)
    self.assertEqual(observed.dtype, object)
def test_supported_dtype(self):
    # Strings mixed with a missing value form a supported input; the
    # column's properties and to_series() output are checked against it.
    series = pd.Series(
        ['foo', np.nan, 'bar', 'foo'],
        name='my column',
        index=pd.Index(['a', 'b', 'c', 'd'], name='id'),
    )
    column = CategoricalMetadataColumn(series)

    # Checked in the same order as the individual assertions they replace.
    expected_properties = (
        ('id_count', 4),
        ('id_header', 'id'),
        ('ids', ('a', 'b', 'c', 'd')),
        ('name', 'my column'),
    )
    for attribute, expected_value in expected_properties:
        self.assertEqual(getattr(column, attribute), expected_value)

    observed = column.to_series()
    pdt.assert_series_equal(observed, series)
    self.assertEqual(observed.dtype, object)
def test_leading_trailing_whitespace_value(self):
    # A value padded with whitespace (' bar ') is expected to be rejected
    # with a ValueError that mentions the column and the offending value.
    expected_message = (
        "CategoricalMetadataColumn.*leading or trailing "
        "whitespace characters.*Column 'col1'.*' bar '"
    )
    padded_values = pd.Series(
        ['foo', ' bar ', 'baz'],
        name='col1',
        index=pd.Index(['a', 'b', 'c'], name='id'),
    )

    with self.assertRaisesRegex(ValueError, expected_message):
        CategoricalMetadataColumn(padded_values)
def test_empty_str_value(self):
    # An empty-string value is expected to be rejected with a ValueError
    # that names the column.
    expected_message = (
        "CategoricalMetadataColumn.*empty strings.*"
        "column 'col1'"
    )
    values_with_blank = pd.Series(
        ['foo', '', 'bar'],
        name='col1',
        index=pd.Index(['a', 'b', 'c'], name='id'),
    )

    with self.assertRaisesRegex(ValueError, expected_message):
        CategoricalMetadataColumn(values_with_blank)
def test_unsupported_type_value(self):
    # A float mixed into otherwise-string values is expected to raise a
    # TypeError that reports the value, its type and the column name.
    expected_message = (
        "CategoricalMetadataColumn.*strings or missing "
        r"values.*42\.5.*float.*'col1'"
    )
    mixed_values = pd.Series(
        ['foo', 'bar', 42.5],
        name='col1',
        index=pd.Index(['a', 'b', 'c'], name='id'),
    )

    with self.assertRaisesRegex(TypeError, expected_message):
        CategoricalMetadataColumn(mixed_values)
def test_unsupported_dtype(self):
    # A series made only of floats (dtype float64) is expected to be
    # rejected with a TypeError that reports the dtype.
    expected_message = (
        "CategoricalMetadataColumn 'col1' does not support"
        ".*Series.*dtype.*float64"
    )
    float_values = pd.Series(
        [42.5, 42.6, 42.7],
        name='col1',
        index=pd.Index(['a', 'b', 'c'], name='id'),
    )

    with self.assertRaisesRegex(TypeError, expected_message):
        CategoricalMetadataColumn(float_values)
def test_missing_data_normalized(self):
    # Different missing data representations (np.nan, float('nan'), None)
    # should all be normalized to np.nan.
    def series_of(values):
        return pd.Series(
            values,
            name='col1',
            index=pd.Index(['a', 'b', 'c', 'd'], name='id'),
        )

    column = CategoricalMetadataColumn(
        series_of([np.nan, 'foo', float('nan'), None]))
    observed = column.to_series()
    expected = series_of([np.nan, 'foo', np.nan, np.nan])

    pdt.assert_series_equal(observed, expected)
    self.assertEqual(observed.dtype, object)

    # Every ID that started out missing must now hold a real NaN.
    for missing_id in ('a', 'c', 'd'):
        self.assertTrue(np.isnan(observed[missing_id]))
def test_missing_data_normalized(self):
    # Different missing data representations should be normalized to np.nan
    sample_ids = ['a', 'b', 'c', 'd']

    raw_values = pd.Series(
        [np.nan, 'foo', float('nan'), None],
        name='col1',
        index=pd.Index(sample_ids, name='id'),
    )
    observed = CategoricalMetadataColumn(raw_values).to_series()

    expected = pd.Series(
        [np.nan, 'foo', np.nan, np.nan],
        name='col1',
        index=pd.Index(sample_ids, name='id'),
    )
    pdt.assert_series_equal(observed, expected)
    self.assertEqual(observed.dtype, object)

    # 'b' holds the only real value; the other three IDs must hold NaN.
    for missing_id in ('a', 'c', 'd'):
        self.assertTrue(np.isnan(observed[missing_id]))
def test_wrong_obj(self):
    # Both constructors are expected to raise a TypeError naming the
    # constructor when given something other than a pandas.Series.
    numeric_message = 'NumericMetadataColumn constructor.*pandas.Series'
    not_a_series = pd.DataFrame([[1, 2, 3]])
    with self.assertRaisesRegex(TypeError, numeric_message):
        NumericMetadataColumn(not_a_series)

    categorical_message = (
        'CategoricalMetadataColumn constructor.*pandas.Series')
    also_not_a_series = {}
    with self.assertRaisesRegex(TypeError, categorical_message):
        CategoricalMetadataColumn(also_not_a_series)
def test_categorical_metadata_column(self):
    # Saving a categorical column is expected to write a tab-separated file
    # with a header row, a '#q2:types' row and one row per ID.
    values = pd.Series(
        ['foo', 'bar', '42.50'],
        name='categorical-column',
        index=pd.Index(['id1', 'id2', 'id3'], name='id'),
    )
    column = CategoricalMetadataColumn(values)
    column.save(self.filepath)

    with open(self.filepath, 'r') as saved:
        written = saved.read()

    expected_lines = (
        "id\tcategorical-column\n",
        "#q2:types\tcategorical\n",
        "id1\tfoo\n",
        "id2\tbar\n",
        "id3\t42.50\n",
    )
    self.assertEqual(written, "".join(expected_lines))
def test_categorical_metadata_column(self):
    # Round-trips a categorical column through save() and compares the raw
    # file text with the expected tab-separated layout.
    sample_ids = pd.Index(['id1', 'id2', 'id3'], name='id')
    source = pd.Series(
        ['foo', 'bar', '42.50'],
        name='categorical-column',
        index=sample_ids,
    )

    CategoricalMetadataColumn(source).save(self.filepath)

    with open(self.filepath, 'r') as handle:
        file_text = handle.read()

    # Header row, '#q2:types' row, then one row per ID.
    expected_text = (
        "id\tcategorical-column\n"
        "#q2:types\tcategorical\n"
        "id1\tfoo\n"
        "id2\tbar\n"
        "id3\t42.50\n"
    )
    self.assertEqual(file_text, expected_text)
def subsample_neighbors(focal_seqs: DNAFASTAFormat,
                        context_seqs: DNAFASTAFormat,
                        percent_id: float,
                        samples_per_cluster: int,
                        locale: CategoricalMetadataColumn = None,
                        max_accepts: int = 10,
                        n_threads: int = 1,
                        seed: int = None) -> IDSelection:
    """Select context sequence IDs that vsearch matches to focal sequences.

    Runs ``vsearch --usearch_global`` with ``focal_seqs`` as the query and
    ``context_seqs`` as the database, groups the reported hits with
    ``_clusters_from_vsearch_out``, and samples IDs from those groups with
    ``_sample_clusters``. The sampled IDs are marked ``True`` in the
    returned selection; every other context ID stays ``False``.

    Parameters
    ----------
    focal_seqs : DNAFASTAFormat
        Query sequences; ``str(focal_seqs)`` is passed to vsearch.
    context_seqs : DNAFASTAFormat
        Database sequences. Their IDs (read with ``ids_from_fasta``) form
        the index of the selection.
    percent_id : float
        Passed to vsearch as ``--id``.
    samples_per_cluster : int
        Forwarded to ``_sample_clusters``.
    locale : CategoricalMetadataColumn, optional
        When given, it is filtered to the context IDs, forwarded to
        ``_clusters_from_vsearch_out`` as a series, and used as the
        metadata of the result. Otherwise the metadata has no columns.
    max_accepts : int, optional
        Passed to vsearch as ``--maxaccepts``. Must not be smaller than
        ``samples_per_cluster``.
    n_threads : int, optional
        Passed to vsearch as ``--threads``.
    seed : int, optional
        Forwarded to ``_sample_clusters``.

    Returns
    -------
    IDSelection
        Built from the boolean inclusion series, the metadata, and the
        label ``"subsample_neighbors"``.

    Raises
    ------
    ValueError
        If ``max_accepts`` is smaller than ``samples_per_cluster``.
    """
    if max_accepts < samples_per_cluster:
        # NOTE(review): the message contains a typo ("it is determines").
        # It is left unchanged in case callers or tests match on it.
        raise ValueError('max_accepts (%d) must be greater than or equal to '
                         'samples_per_cluster (%d), since it is determines '
                         'the largest number of samples that could be '
                         'obtained per cluster.' %
                         (max_accepts, samples_per_cluster))

    context_ids = ids_from_fasta(str(context_seqs))
    inclusion = pd.Series(False, index=context_ids, name='inclusion')
    if locale is not None:
        locale = locale.filter_ids(inclusion.index).to_series()
        metadata = pd.DataFrame(locale)
    else:
        metadata = pd.DataFrame(index=pd.Index(inclusion.index))
    metadata.index.name = 'id'

    with tempfile.NamedTemporaryFile() as vsearch_out_f:
        command = ['vsearch',
                   '--threads', str(n_threads),
                   '--usearch_global', str(focal_seqs),
                   '--id', str(percent_id),
                   '--db', str(context_seqs),
                   '--userout', vsearch_out_f.name,
                   '--qmask', 'none',
                   '--maxaccepts', str(max_accepts),
                   '--uc_allhits',
                   '--userfields', 'query+target+mism']
        run_command(command)

        # Read through an explicitly managed handle so it is closed as soon
        # as parsing finishes; a bare open() passed to read_csv is never
        # closed by pandas.
        with open(vsearch_out_f.name) as vsearch_out_fh:
            vsearch_out = pd.read_csv(
                vsearch_out_fh, sep='\t', na_values='*',
                names=['focal_id', 'context_id', 'n_mismatches'])

        clusters = _clusters_from_vsearch_out(vsearch_out, locale)
        context_seqs_to_keep = \
            _sample_clusters(clusters, samples_per_cluster, seed=seed)
        inclusion[context_seqs_to_keep] = True

    return IDSelection(inclusion,
                       qiime2.Metadata(metadata),
                       "subsample_neighbors")
def test_type_mismatch(self):
    # Columns of different MetadataColumn types are expected to compare
    # unequal even when their name and IDs match.
    def fresh_ids():
        return pd.Index(['id1', 'id2', 'id3'], name='id')

    dummy = DummyMetadataColumn(
        pd.Series([1.0, 2.0, 3.0], name='col1', index=fresh_ids()))
    numeric = NumericMetadataColumn(
        pd.Series([1.0, 2.0, 3.0], name='col1', index=fresh_ids()))
    categorical = CategoricalMetadataColumn(
        pd.Series(['a', 'b', 'c'], name='col1', index=fresh_ids()))

    for other in (numeric, categorical):
        self.assertReallyNotEqual(dummy, other)