Exemplo n.º 1
0
    def test_leading_trailing_whitespace_column_name(self):
        # Two columns with the same values and IDs, whose names differ only
        # by surrounding whitespace (' col ' vs 'col'), are expected to
        # compare equal.
        padded_name_series = pd.Series(
            ['foo', ' bar ', 'baz'], name=' col ',
            index=pd.Index(['a', 'b', 'c'], name='id'))
        padded = CategoricalMetadataColumn(padded_name_series)

        bare_name_series = pd.Series(
            ['foo', ' bar ', 'baz'], name='col',
            index=pd.Index(['a', 'b', 'c'], name='id'))
        bare = CategoricalMetadataColumn(bare_name_series)

        self.assertEqual(padded, bare)
Exemplo n.º 2
0
    def test_all_missing_data(self):
        # A column holding nothing but missing values should come back from
        # to_series() equal to its input and still use the object dtype.
        def all_nan_series():
            # Fresh objects on every call so the input and the expected
            # value never share an array or an index.
            return pd.Series(
                np.array([np.nan, np.nan, np.nan], dtype=object),
                name='col1',
                index=pd.Index(['a', 'b', 'c'], name='id'))

        column = CategoricalMetadataColumn(all_nan_series())
        observed = column.to_series()
        expected = all_nan_series()

        pdt.assert_series_equal(observed, expected)
        self.assertEqual(observed.dtype, object)
Exemplo n.º 3
0
    def test_all_missing_data(self):
        # Input made up solely of NaN values stored with the object dtype.
        missing_only = pd.Series(
            np.array([np.nan, np.nan, np.nan], dtype=object), name='col1',
            index=pd.Index(['a', 'b', 'c'], name='id'))
        column = CategoricalMetadataColumn(missing_only)

        round_tripped = column.to_series()

        # Built independently of the input so the comparison cannot pass
        # merely because both sides are the same object.
        expected = pd.Series(
            np.array([np.nan, np.nan, np.nan], dtype=object), name='col1',
            index=pd.Index(['a', 'b', 'c'], name='id'))

        pdt.assert_series_equal(round_tripped, expected)
        self.assertEqual(round_tripped.dtype, object)
Exemplo n.º 4
0
    def test_supported_dtype(self):
        # Strings mixed with one missing value: the column should expose the
        # expected ID/name attributes and hand back an equal series.
        sample_ids = pd.Index(['a', 'b', 'c', 'd'], name='id')
        series = pd.Series(['foo', np.nan, 'bar', 'foo'],
                           name='my column', index=sample_ids)
        column = CategoricalMetadataColumn(series)

        self.assertEqual(column.id_count, 4)
        self.assertEqual(column.id_header, 'id')
        self.assertEqual(column.ids, ('a', 'b', 'c', 'd'))
        self.assertEqual(column.name, 'my column')

        round_tripped = column.to_series()
        pdt.assert_series_equal(round_tripped, series)
        self.assertEqual(round_tripped.dtype, object)
Exemplo n.º 5
0
    def test_numeric_strings_preserved_as_strings(self):
        # Values such as '1' and '2.5' are strings that merely look numeric;
        # to_series() is expected to return them unchanged with object dtype.
        sample_ids = pd.Index(['a', 'b', 'c', 'd'], name='id')
        series = pd.Series(['1', np.nan, '2.5', '3.0'],
                           name='my column', index=sample_ids)
        column = CategoricalMetadataColumn(series)

        self.assertEqual(column.id_count, 4)
        self.assertEqual(column.id_header, 'id')
        self.assertEqual(column.ids, ('a', 'b', 'c', 'd'))
        self.assertEqual(column.name, 'my column')

        round_tripped = column.to_series()
        pdt.assert_series_equal(round_tripped, series)
        self.assertEqual(round_tripped.dtype, object)
Exemplo n.º 6
0
    def test_numeric_strings_preserved_as_strings(self):
        # Numeric-looking strings plus one missing value go in; the same
        # series (object dtype) is expected to come back out.
        series = pd.Series(['1', np.nan, '2.5', '3.0'],
                           name='my column',
                           index=pd.Index(['a', 'b', 'c', 'd'], name='id'))
        column = CategoricalMetadataColumn(series)

        # (observed, expected) pairs for the column's descriptive attributes.
        attribute_checks = [
            (column.id_count, 4),
            (column.id_header, 'id'),
            (column.ids, ('a', 'b', 'c', 'd')),
            (column.name, 'my column'),
        ]
        for observed, expected in attribute_checks:
            self.assertEqual(observed, expected)

        round_tripped = column.to_series()
        pdt.assert_series_equal(round_tripped, series)
        self.assertEqual(round_tripped.dtype, object)
Exemplo n.º 7
0
    def test_supported_dtype(self):
        # A series of strings with one NaN is wrapped in a column, inspected,
        # and then converted back for comparison with the input.
        series = pd.Series(['foo', np.nan, 'bar', 'foo'],
                           name='my column',
                           index=pd.Index(['a', 'b', 'c', 'd'], name='id'))
        column = CategoricalMetadataColumn(series)

        # (observed, expected) pairs for the column's descriptive attributes.
        attribute_checks = [
            (column.id_count, 4),
            (column.id_header, 'id'),
            (column.ids, ('a', 'b', 'c', 'd')),
            (column.name, 'my column'),
        ]
        for observed, expected in attribute_checks:
            self.assertEqual(observed, expected)

        round_tripped = column.to_series()
        pdt.assert_series_equal(round_tripped, series)
        self.assertEqual(round_tripped.dtype, object)
Exemplo n.º 8
0
 def test_leading_trailing_whitespace_value(self):
     # The value ' bar ' carries surrounding whitespace; building the column
     # is expected to raise a ValueError mentioning the column name and the
     # offending value.
     padded_values = pd.Series(
         ['foo', ' bar ', 'baz'], name='col1',
         index=pd.Index(['a', 'b', 'c'], name='id'))
     expected_message = (
         "CategoricalMetadataColumn.*leading or trailing "
         "whitespace characters.*Column 'col1'.*' bar '")

     with self.assertRaisesRegex(ValueError, expected_message):
         CategoricalMetadataColumn(padded_values)
Exemplo n.º 9
0
 def test_empty_str_value(self):
     # An empty string among the values is expected to make construction
     # fail with a ValueError that names the column.
     values_with_empty = pd.Series(
         ['foo', '', 'bar'], name='col1',
         index=pd.Index(['a', 'b', 'c'], name='id'))
     expected_message = (
         "CategoricalMetadataColumn.*empty strings.*"
         "column 'col1'")

     with self.assertRaisesRegex(ValueError, expected_message):
         CategoricalMetadataColumn(values_with_empty)
Exemplo n.º 10
0
 def test_unsupported_type_value(self):
     # A float (42.5) mixed in with strings is expected to be rejected with
     # a TypeError that reports the value, its type and the column name.
     mixed_values = pd.Series(
         ['foo', 'bar', 42.5], name='col1',
         index=pd.Index(['a', 'b', 'c'], name='id'))
     expected_message = (
         "CategoricalMetadataColumn.*strings or missing "
         r"values.*42\.5.*float.*'col1'")

     with self.assertRaisesRegex(TypeError, expected_message):
         CategoricalMetadataColumn(mixed_values)
Exemplo n.º 11
0
 def test_unsupported_dtype(self):
     # A series made only of floats is expected to be rejected with a
     # TypeError that mentions the column name and the float64 dtype.
     float_values = pd.Series(
         [42.5, 42.6, 42.7], name='col1',
         index=pd.Index(['a', 'b', 'c'], name='id'))
     expected_message = (
         "CategoricalMetadataColumn 'col1' does not support"
         ".*Series.*dtype.*float64")

     with self.assertRaisesRegex(TypeError, expected_message):
         CategoricalMetadataColumn(float_values)
Exemplo n.º 12
0
    def test_missing_data_normalized(self):
        # np.nan, float('nan') and None are three spellings of "missing";
        # all of them should be normalized to np.nan in the output series.
        raw_values = pd.Series(
            [np.nan, 'foo', float('nan'), None],
            name='col1',
            index=pd.Index(['a', 'b', 'c', 'd'], name='id'))
        column = CategoricalMetadataColumn(raw_values)

        observed = column.to_series()

        expected = pd.Series(
            [np.nan, 'foo', np.nan, np.nan],
            name='col1',
            index=pd.Index(['a', 'b', 'c', 'd'], name='id'))

        pdt.assert_series_equal(observed, expected)
        self.assertEqual(observed.dtype, object)
        # Check each originally-missing position individually.
        for missing_id in ('a', 'c', 'd'):
            self.assertTrue(np.isnan(observed[missing_id]))
Exemplo n.º 13
0
    def test_missing_data_normalized(self):
        # Different missing data representations should be normalized to
        # np.nan.
        def make_series(values):
            # Each call builds its own index so the input and the expected
            # series share nothing.
            return pd.Series(
                values, name='col1',
                index=pd.Index(['a', 'b', 'c', 'd'], name='id'))

        column = CategoricalMetadataColumn(
            make_series([np.nan, 'foo', float('nan'), None]))

        observed = column.to_series()

        expected = make_series([np.nan, 'foo', np.nan, np.nan])

        pdt.assert_series_equal(observed, expected)
        self.assertEqual(observed.dtype, object)
        # IDs 'a', 'c' and 'd' held the three missing-value spellings.
        for missing_id in ('a', 'c', 'd'):
            self.assertTrue(np.isnan(observed[missing_id]))
Exemplo n.º 14
0
    def test_wrong_obj(self):
        # Neither a DataFrame nor a dict is a pandas.Series; each constructor
        # is expected to raise a TypeError that names itself.
        numeric_message = 'NumericMetadataColumn constructor.*pandas.Series'
        with self.assertRaisesRegex(TypeError, numeric_message):
            NumericMetadataColumn(pd.DataFrame([[1, 2, 3]]))

        categorical_message = (
            'CategoricalMetadataColumn constructor.*pandas.Series')
        with self.assertRaisesRegex(TypeError, categorical_message):
            CategoricalMetadataColumn({})
Exemplo n.º 15
0
    def test_categorical_metadata_column(self):
        # Save a three-row categorical column to self.filepath and compare
        # the written text with the expected TSV content.
        series = pd.Series(
            ['foo', 'bar', '42.50'], name='categorical-column',
            index=pd.Index(['id1', 'id2', 'id3'], name='id'))
        column = CategoricalMetadataColumn(series)

        column.save(self.filepath)

        with open(self.filepath, 'r') as saved_file:
            written = saved_file.read()

        # Header row, "#q2:types" directive row, then one row per ID.
        expected = (
            "id\tcategorical-column\n"
            "#q2:types\tcategorical\n"
            "id1\tfoo\n"
            "id2\tbar\n"
            "id3\t42.50\n"
        )

        self.assertEqual(written, expected)
Exemplo n.º 16
0
    def test_categorical_metadata_column(self):
        # The text that save() writes to self.filepath is compared verbatim
        # against this expected content: header row, "#q2:types" directive
        # row, then one row per ID.
        expected = ("id\tcategorical-column\n"
                    "#q2:types\tcategorical\n"
                    "id1\tfoo\n"
                    "id2\tbar\n"
                    "id3\t42.50\n")

        column = CategoricalMetadataColumn(
            pd.Series(['foo', 'bar', '42.50'],
                      name='categorical-column',
                      index=pd.Index(['id1', 'id2', 'id3'], name='id')))
        column.save(self.filepath)

        with open(self.filepath, 'r') as saved_file:
            written = saved_file.read()

        self.assertEqual(written, expected)
Exemplo n.º 17
0
def subsample_neighbors(focal_seqs: DNAFASTAFormat,
                        context_seqs: DNAFASTAFormat,
                        percent_id: float,
                        samples_per_cluster: int,
                        locale: CategoricalMetadataColumn = None,
                        max_accepts: int = 10,
                        n_threads: int = 1,
                        seed: int = None) -> IDSelection:
    """Select context sequences that vsearch reports as hits to focal ones.

    ``vsearch --usearch_global`` is run with ``focal_seqs`` as the queries
    and ``context_seqs`` as the database.  The resulting hit table is
    grouped by ``_clusters_from_vsearch_out`` and ``_sample_clusters``
    chooses which context IDs get flagged as included.

    Parameters
    ----------
    focal_seqs : DNAFASTAFormat
        Query sequences (given to ``--usearch_global``).
    context_seqs : DNAFASTAFormat
        Database sequences (given to ``--db``).  Their IDs form the index
        of the returned selection.
    percent_id : float
        Value passed to vsearch's ``--id`` option.
    samples_per_cluster : int
        Forwarded to ``_sample_clusters``.
    locale : CategoricalMetadataColumn, optional
        When provided, it is filtered to the context IDs, converted to a
        Series, forwarded to ``_clusters_from_vsearch_out`` and used as the
        metadata of the returned selection.
    max_accepts : int, optional
        Value passed to vsearch's ``--maxaccepts`` option.  Must not be
        smaller than ``samples_per_cluster``.
    n_threads : int, optional
        Value passed to vsearch's ``--threads`` option.
    seed : int, optional
        Forwarded to ``_sample_clusters``.

    Returns
    -------
    IDSelection
        Built from a boolean ``inclusion`` Series (``True`` for the sampled
        context IDs), the metadata described above, and the label
        ``"subsample_neighbors"``.

    Raises
    ------
    ValueError
        If ``max_accepts`` is less than ``samples_per_cluster``.
    """
    if max_accepts < samples_per_cluster:
        raise ValueError('max_accepts (%d) must be greater than or equal to '
                         'samples_per_cluster (%d), since it is determines '
                         'the largest number of samples that could be '
                         'obtained per cluster.' %
                         (max_accepts, samples_per_cluster))

    context_ids = ids_from_fasta(str(context_seqs))

    # Every context sequence starts out excluded.
    inclusion = pd.Series(False, index=context_ids, name='inclusion')
    if locale is not None:
        locale = locale.filter_ids(inclusion.index).to_series()
        metadata = pd.DataFrame(locale)
    else:
        metadata = pd.DataFrame(index=pd.Index(inclusion.index))
    metadata.index.name = 'id'

    with tempfile.NamedTemporaryFile() as vsearch_out_f:
        command = [
            'vsearch',
            '--threads', str(n_threads),
            '--usearch_global', str(focal_seqs),
            '--id', str(percent_id),
            '--db', str(context_seqs),
            '--userout', vsearch_out_f.name,
            '--qmask', 'none',
            '--maxaccepts', str(max_accepts),
            '--uc_allhits',
            '--userfields', 'query+target+mism',
        ]
        run_command(command)

        # Read through a context manager so the second handle on the
        # temporary file is closed deterministically instead of leaking.
        with open(vsearch_out_f.name) as vsearch_out_fh:
            vsearch_out = pd.read_csv(
                vsearch_out_fh,
                sep='\t',
                na_values='*',
                names=['focal_id', 'context_id', 'n_mismatches'])

        clusters = _clusters_from_vsearch_out(vsearch_out, locale)
        context_seqs_to_keep = \
            _sample_clusters(clusters, samples_per_cluster, seed=seed)
        inclusion[context_seqs_to_keep] = True

    return IDSelection(inclusion, qiime2.Metadata(metadata),
                       "subsample_neighbors")
Exemplo n.º 18
0
    def test_type_mismatch(self):
        # Columns of different classes must not compare equal, even when
        # they share a name and IDs (and, for dummy vs numeric, values).
        def make_index():
            # A new index per series so the columns share no objects.
            return pd.Index(['id1', 'id2', 'id3'], name='id')

        dummy = DummyMetadataColumn(
            pd.Series([1.0, 2.0, 3.0], name='col1', index=make_index()))
        numeric = NumericMetadataColumn(
            pd.Series([1.0, 2.0, 3.0], name='col1', index=make_index()))
        categorical = CategoricalMetadataColumn(
            pd.Series(['a', 'b', 'c'], name='col1', index=make_index()))

        self.assertReallyNotEqual(dummy, numeric)
        self.assertReallyNotEqual(dummy, categorical)