Example #1
    def test_typical(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
        exp = [
            # sample a, fwd
            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
            # sample a, rev
            '@id1\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
            '@id3\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n',
            # sample b, fwd
            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
            # sample b, rev
            '@id2\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
            '@id4\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
            '@id5\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n',
        ]
        exp_untrimmed = [
            '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
            '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_paired_fn(self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
Example #2
    def test_multiple_orientations_dual_indices(self):
        forward_barcodes = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='ForwardBarcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
        reverse_barcodes = CategoricalMetadataColumn(
            pd.Series(['GGGG', 'TTTT'],
                      name='ReverseBarcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))

        mixed_orientation_sequences_f_fp = self.get_data_path(
            'mixed-orientation/forward.fastq.gz')
        mixed_orientation_sequences_r_fp = self.get_data_path(
            'mixed-orientation/reverse.fastq.gz')

        # These files have forward and reverse reads mixed together in the same
        # file
        with tempfile.TemporaryDirectory() as temp:
            shutil.copy(mixed_orientation_sequences_f_fp, temp)
            shutil.copy(mixed_orientation_sequences_r_fp, temp)
            mixed_orientation_sequences = Artifact.import_data(
                'MultiplexedPairedEndBarcodeInSequence', temp)

        with self.assertRaisesRegex(
                ValueError, 'Dual-indexed barcodes for mixed '
                'orientation reads are not supported.'):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_paired_fn(mixed_orientation_sequences,
                                     forward_barcodes=forward_barcodes,
                                     reverse_barcodes=reverse_barcodes,
                                     mixed_orientation=True)
Example #3
    def test_batch_size(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
        exp = [
            # sample a
            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
            # sample b
            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata,
                                     batch_size=1)

        # This test should yield the same results as test_typical, above;
        # the fact that we are batching shouldn't impact the final results.
        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results('@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                      obs_untrimmed_art)
Example #4
    def test_batch_size_odd_number_of_samples(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC', 'GGGG'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                     name='id')))
        exp = [
            # sample a
            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
            # sample b
            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
            # sample c
            '@id6\nACGTACGT\n+\nzzzzzzzz\n',
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata,
                                     batch_size=2)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results('', obs_untrimmed_art)
Example #5
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn, mode: str) -> biom.Table:
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    if axis == 'feature':
        biom_axis = 'observation'
    else:
        biom_axis = axis

    metadata = _munge_metadata_column(metadata, table.ids(axis=biom_axis),
                                      axis)

    grouped_table = table.collapse(
        lambda axis_id, _: metadata.get_value(axis_id),
        collapse_f=_mode_lookup[mode],
        axis=biom_axis,
        norm=False,
        include_collapsed_metadata=False)
    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    series = metadata.to_series()
    return grouped_table.sort_order(series.unique(), axis=biom_axis)
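
A minimal standalone sketch (toy table and group labels, not from the plugin) of the collapse-by-metadata idea that `group` builds on, using only biom's public `Table.collapse` and `Table.sort_order`:

import biom
import numpy as np
import pandas as pd

# Hypothetical 2-feature x 4-sample table; `groups` stands in for the
# CategoricalMetadataColumn's sample -> group mapping.
table = biom.Table(np.array([[1, 2, 3, 4],
                             [5, 6, 7, 8]]),
                   observation_ids=['featA', 'featB'],
                   sample_ids=['s1', 's2', 's3', 's4'])
groups = pd.Series(['left', 'left', 'right', 'right'],
                   index=['s1', 's2', 's3', 's4'])

# Collapse samples into their groups (default aggregation); norm=False skips
# the division by group size that biom would otherwise apply.
grouped = table.collapse(lambda sample_id, _: groups[sample_id],
                         norm=False, axis='sample',
                         include_collapsed_metadata=False)
# Reorder by first appearance of each group value, as the plugin does.
grouped = grouped.sort_order(groups.unique(), axis='sample')
print(grouped.ids(axis='sample'))           # group ids, in first-seen order
print(grouped.data('left', axis='sample'))  # collapsed counts for that group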
Example #6
    def test_variable_length_barcodes(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAAA', 'CCCCCC', 'GGGG'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                     name='id')))
        muxed_sequences_fp = self.get_data_path('variable_length.fastq.gz')
        muxed_sequences = Artifact.import_data(
            'MultiplexedSingleEndBarcodeInSequence', muxed_sequences_fp)
        exp = [
            # sample a
            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
            # sample b
            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
            # sample c
            '@id6\nACGTACGT\n+\nzzzzzzzz\n',
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results('', obs_untrimmed_art)
Example #7
def heatmap(output_dir: str,
            ranks: pd.DataFrame,
            microbe_metadata: qiime2.CategoricalMetadataColumn = None,
            metabolite_metadata: qiime2.CategoricalMetadataColumn = None,
            method: str = 'average',
            metric: str = 'euclidean',
            color_palette: str = 'seismic',
            margin_palette: str = 'cubehelix',
            x_labels: bool = False,
            y_labels: bool = False,
            level: int = -1) -> None:
    if microbe_metadata is not None:
        microbe_metadata = microbe_metadata.to_series()
    if metabolite_metadata is not None:
        metabolite_metadata = metabolite_metadata.to_series()

    hotmap = ranks_heatmap(ranks, microbe_metadata, metabolite_metadata,
                           method, metric, color_palette, margin_palette,
                           x_labels, y_labels, level)

    hotmap.savefig(join(output_dir, 'heatmap.pdf'), bbox_inches='tight')
    hotmap.savefig(join(output_dir, 'heatmap.png'), bbox_inches='tight')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'title': 'Rank Heatmap',
                           'pdf_fp': 'heatmap.pdf',
                           'png_fp': 'heatmap.png'
                       })
Example #8
    def test_mixed_orientation_success(self):
        forward_barcodes = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='ForwardBarcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))

        mixed_orientation_sequences_f_fp = self.get_data_path(
            'mixed-orientation/forward.fastq.gz')
        mixed_orientation_sequences_r_fp = self.get_data_path(
            'mixed-orientation/reverse.fastq.gz')

        with tempfile.TemporaryDirectory() as temp:
            shutil.copy(mixed_orientation_sequences_f_fp, temp)
            shutil.copy(mixed_orientation_sequences_r_fp, temp)
            mixed_orientation_sequences = Artifact.import_data(
                'MultiplexedPairedEndBarcodeInSequence', temp)

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_paired_fn(mixed_orientation_sequences,
                                     forward_barcodes=forward_barcodes,
                                     mixed_orientation=True)

        self.assert_demux_results(forward_barcodes.to_series(),
                                  obs_demuxed_art)
        # Everything should match
        self.assert_untrimmed_results([b'', b''], obs_untrimmed_art)
Example #9
    def test_min_length(self):
        metadata = CategoricalMetadataColumn(
            # The third barcode is meant to completely remove the only GGGG
            # coded sequence
            pd.Series(['AAAA', 'CCCC', 'GGGGACGTACGT'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                     name='id')))
        exp = [
            # sample a
            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
            # sample b
            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
            # sample c is empty because the barcode matched the entire
            # read, which removed everything.
            '',
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results('', obs_untrimmed_art)
Example #10
def subsample_longitudinal(dates: qiime2.CategoricalMetadataColumn,
                           start_date: str = None,
                           samples_per_interval: int = 7,
                           days_per_interval: int = 7,
                           seed: int = None) -> IDSelection:

    window_size = '%dD' % days_per_interval

    dt_series = pd.to_datetime(dates.to_series(), errors='coerce')
    df = pd.DataFrame({'ids': dates.to_series().index}, index=dt_series)

    if start_date is not None:
        filter_before = pd.Timestamp(start_date)
        df = df.iloc[np.where(dt_series >= filter_before)]
        if filter_before not in df.index:
            # this will be stripped in _sample_group::_sampler
            # the purpose is to force Pandas to begin the window at this
            # time instead of the first observation (by making NaN the first
            # observation)
            df.loc[filter_before] = float('nan')

    grouped = df.groupby(pd.Grouper(freq=window_size,
                                    convention='start',
                                    closed='left'),
                         group_keys=False)
    filtered_df = grouped.apply(_sample_group(samples_per_interval, seed))

    df = df.dropna(axis=0)
    selection = pd.Series(False, index=dates.to_series().index)
    selection[filtered_df['ids']] = True

    md = qiime2.Metadata(dates.to_dataframe())
    return IDSelection(selection, md, 'subsample_longitudinal')
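
A self-contained sketch (toy dates and a hypothetical `sampler` helper) of the windowing idea used above: bucket IDs into fixed-length date windows with pd.Grouper, then keep at most N IDs per window:

import pandas as pd

dates = pd.Series(pd.to_datetime(['2021-01-01', '2021-01-02', '2021-01-05',
                                  '2021-01-09', '2021-01-12']),
                  index=['s1', 's2', 's3', 's4', 's5'])
df = pd.DataFrame({'ids': dates.index}, index=dates.values)

def sampler(group, n=2, seed=0):
    # Keep at most n IDs per window.
    return group if len(group) <= n else group.sample(n, random_state=seed)

picked = (df.groupby(pd.Grouper(freq='7D', closed='left'), group_keys=False)
            .apply(sampler))
print(sorted(picked['ids']))  # e.g. two IDs from the first window, both from the second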
Example #11
    def test_mixed_orientation_success(self):
        # sample_a and sample_b have reads in both fwd and rev directions.
        # sample_c only has reads in the fwd direction.
        # sample_d only has reads in the rev direction.
        forward_barcodes = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'],
                      name='ForwardBarcode',
                      index=pd.Index(
                          ['sample_a', 'sample_b', 'sample_c', 'sample_d'],
                          name='id')))
        mixed_orientation_sequences_f_fp = self.get_data_path(
            'mixed-orientation/forward.fastq.gz')
        mixed_orientation_sequences_r_fp = self.get_data_path(
            'mixed-orientation/reverse.fastq.gz')
        with tempfile.TemporaryDirectory() as temp:
            shutil.copy(mixed_orientation_sequences_f_fp, temp)
            shutil.copy(mixed_orientation_sequences_r_fp, temp)
            mixed_orientation_sequences = Artifact.import_data(
                'MultiplexedPairedEndBarcodeInSequence', temp)

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_paired_fn(mixed_orientation_sequences,
                                     forward_barcodes=forward_barcodes,
                                     mixed_orientation=True)
        exp = [
            # sample_a fwd
            '@id1\nACGTACGT\n+\nyyyyyyyy\n' \
            '@id3\nACGTACGT\n+\nyyyyyyyy\n',
            # sample_a rev
            '@id1\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n' \
            '@id3\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
            # sample_b fwd
            '@id4\nACGTACGT\n+\nyyyyyyyy\n' \
            '@id2\nACGTACGT\n+\nyyyyyyyy\n',
            # sample_b rev
            '@id4\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n' \
            '@id2\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
            # sample_c fwd
            '@id5\nACGTACGT\n+\nyyyyyyyy\n',
            # sample_c rev
            '@id5\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
            # sample_d fwd
            '@id6\nACGTACGT\n+\nyyyyyyyy\n',
            # sample_d rev
            '@id6\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n', ]

        # We want to be sure that the validation is 100%, not just `min`.
        obs_demuxed_art.validate(level='max')
        # checkpoint assertion for the above `validate` - nothing should fail
        self.assertTrue(True)

        self.assert_demux_results(forward_barcodes.to_series(), exp,
                                  obs_demuxed_art)

        # Everything should match, so untrimmed should be empty
        self.assert_untrimmed_results(['', ''], obs_untrimmed_art)
Example #12
    def test_typical(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
        self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                      obs_untrimmed_art)
Example #13
    def test_all_matched(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC', 'GGGG'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                     name='id')))

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
        # obs_untrimmed should be empty, since everything matched
        self.assert_untrimmed_results(b'', obs_untrimmed_art)
Example #14
    def setUp(self):
        self.results = "results"
        if not os.path.exists(self.results):
            os.mkdir(self.results)
        self.balances = pd.DataFrame(
            {
                'a': [-2, -1, 0, 1, 2],
                'b': [-2, 0, 0, 0, 0]
            },
            index=['a1', 'a2', 'a3', 'a4', 'a5'])
        self.tree = TreeNode.read([r'((k, q)d, ((x, y)a, z)b)c;'])

        self.taxonomy = pd.DataFrame(
            [['foo;barf;a;b;c;d;e', 1], ['foo;bark;f;g;h;i;j', 1],
             ['foo;bark;f;g;h;w;j', 1], ['nom;tu;k;l;m;n;o', 0.9],
             ['nom;tu;k;l;m;t;o', 0.9]],
            columns=['Taxon', 'Confidence'],
            index=['x', 'y', 'z', 'k', 'q'])

        self.balances = pd.DataFrame(
            [[1, 2, 3, 4, 5, 6, 7], [-3.1, -2.9, -3, 3, 2.9, 3.2, 3.1],
             [1, 1, 1, 1, 1, 1, 1], [3, 2, 1, 0, -1, -2, -3]],
            index=['d', 'a', 'b', 'c'],
            columns=['s1', 's2', 's3', 's4', 's5', 's6', 's7']).T
        basis, _ = balance_basis(self.tree)
        self.table = pd.DataFrame(
            ilr_inv(self.balances, basis),
            columns=['x', 'y', 'z', 'k', 'q'],
            index=['s1', 's2', 's3', 's4', 's5', 's6', 's7'])

        index = pd.Index(['s1', 's2', 's3', 's4', 's5', 's6', 's7'], name='id')
        self.categorical = CategoricalMetadataColumn(
            pd.Series(['a', 'a', 'a', 'b', 'b', 'b', 'b'],
                      index=index,
                      name='categorical'))
        self.multi_categorical = CategoricalMetadataColumn(
            pd.Series(['a', 'a', 'c', 'b', 'b', 'b', 'c'],
                      index=index,
                      name='multi_categorical'))
        self.partial_numerical_categorical = CategoricalMetadataColumn(
            pd.Series(['1', '1', '1', '2', '2', '2', 'a'],
                      index=index,
                      name='multi_categorical'))
        self.full_numerical_categorical = CategoricalMetadataColumn(
            pd.Series(['1', '1', '1.0', '2', '2', '2.0', '3'],
                      index=index,
                      name='numerical_categorical'))
        self.continuous = NumericMetadataColumn(
            pd.Series(np.arange(7), index=index, name='continuous'))
Example #15
    def test_error_tolerance_high_enough_to_prevent_filtering(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAG', 'CCCC'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata,
                                     error_rate=0.25)

        # This test should yield the same results as test_typical, above
        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
        self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                      obs_untrimmed_art)
Example #16
    def test_di_mismatched_barcodes(self):
        forward_barcodes = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC', 'ACGT'],
                      name='ForwardBarcode',
                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                     name='id')))
        reverse_barcodes = CategoricalMetadataColumn(
            pd.Series(['GGGG', 'TTTT'],
                      name='ReverseBarcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))

        with self.assertRaisesRegex(ValueError, 'do not have.*sample_c'):
            self.demux_paired_fn(self.muxed_sequences,
                                 forward_barcodes=forward_barcodes,
                                 reverse_barcodes=reverse_barcodes)
Example #17
    def test_heatmap_extra_tips(self):
        # Adds a test scenario where there are more tips than features
        # in the table
        np.random.seed(0)
        num_otus = 11  # otus
        index = pd.Index(np.arange(5).astype(np.str), name='id')
        table = pd.DataFrame(np.random.random((len(index), num_otus)),
                             index=index,
                             columns=np.arange(num_otus).astype(np.str))

        x = np.random.rand(num_otus * 2)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
        lm = ward(dm.condensed_form())
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(np.str))

        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand() * 3

        md = CategoricalMetadataColumn(
            pd.Series(['a', 'a', 'a', 'b', 'b'],
                      index=index,
                      name='column-name'))

        dendrogram_heatmap(self.results, table, t, md)

        index_fp = os.path.join(self.results, 'index.html')
        self.assertTrue(os.path.exists(index_fp))

        with open(index_fp, 'r') as fh:
            html = fh.read()
            self.assertIn('<h1>Dendrogram heatmap</h1>', html)
Example #18
    def test_visualization_garbage_metadata(self):
        # tests the scenario where ndim > number of tips
        np.random.seed(0)
        num_otus = 10  # otus
        num_samples = 5
        table = pd.DataFrame(np.random.random((num_samples, num_otus)),
                             index=np.arange(num_samples).astype(np.str),
                             columns=np.arange(num_otus).astype(np.str))

        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
        lm = ward(dm.condensed_form())
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(np.str))

        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand() * 3

        md = CategoricalMetadataColumn(
            pd.Series(['a', 'a', 'a', 'b', 'b', 'foo', 'foo'],
                      index=pd.Index(np.arange(7).astype(np.str), name='id'),
                      name='column-name'))

        dendrogram_heatmap(self.results, table, t, md)

        index_fp = os.path.join(self.results, 'index.html')
        self.assertTrue(os.path.exists(index_fp))

        with open(index_fp, 'r') as fh:
            html = fh.read()
            self.assertIn('<h1>Dendrogram heatmap</h1>', html)
Example #19
    def test_extra_barcode_in_metadata(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'],
                      name='Barcode',
                      index=pd.Index(
                          ['sample_a', 'sample_b', 'sample_c', 'sample_d'],
                          name='id')))
        exp = [
            # sample a
            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
            # sample b
            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
            # sample c
            '@id6\nACGTACGT\n+\nzzzzzzzz\n',
            # sample d is empty because no reads matched the barcode TTTT
            '',
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata)

        exp_samples_and_barcodes = pd.Series(
            ['AAAA', 'CCCC', 'GGGG', 'TTTT'],
            index=['sample_a', 'sample_b', 'sample_c', 'sample_d'])
        self.assert_demux_results(exp_samples_and_barcodes, exp,
                                  obs_demuxed_art)
        self.assert_untrimmed_results('', obs_untrimmed_art)
Example #20
    def setUp(self):
        _ranks = pd.DataFrame([[4.1, 1.3, 2.1], [0.1, 0.3, 0.2],
                               [2.2, 4.3, 3.2], [-6.3, -4.4, 2.1]],
                              index=pd.Index([c for c in 'ABCD'], name='id'),
                              columns=['m1', 'm2', 'm3'])
        self.ranks = Artifact.import_data('FeatureData[Conditional]', _ranks)
        self.taxa = CategoricalMetadataColumn(
            pd.Series([
                'k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; '
                'o__Desulfobacterales; f__Desulfobulbaceae; g__; s__',
                'k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Streptophyta',
                'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; '
                'o__Rickettsiales; f__mitochondria; g__Lardizabala; s__biternata',
                'k__Archaea; p__Euryarchaeota; c__Methanomicrobia; '
                'o__Methanosarcinales; f__Methanosarcinaceae; g__Methanosarcina'
            ],
                      index=pd.Index([c for c in 'ABCD'], name='feature-id'),
                      name='Taxon'))
        metabolites = biom.Table(np.array([[9, 8, 2], [2, 1, 2], [9, 4, 5],
                                           [8, 8, 7]]),
                                 sample_ids=['s1', 's2', 's3'],
                                 observation_ids=['m1', 'm2', 'm3', 'm4'])
        self.metabolites = Artifact.import_data('FeatureTable[Frequency]',
                                                metabolites)
        microbes = biom.Table(np.array([[1, 2, 3], [3, 6, 3], [1, 9, 9],
                                        [8, 8, 7]]),
                              sample_ids=['s1', 's2', 's3'],
                              observation_ids=[i for i in 'ABCD'])
        self.microbes = Artifact.import_data('FeatureTable[Frequency]',
                                             microbes)
Example #21
    def test_invalid_batch_size(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))

        with self.assertRaisesRegex(ValueError, '5.*cannot be greater.*2'):
            self.demux_single_fn(self.muxed_sequences, metadata, batch_size=5)
Example #22
def classify_samples(output_dir: str,
                     table: pd.DataFrame,
                     metadata: qiime2.CategoricalMetadataColumn,
                     test_size: float = defaults['test_size'],
                     step: float = defaults['step'],
                     cv: int = defaults['cv'],
                     random_state: int = None,
                     n_jobs: int = defaults['n_jobs'],
                     n_estimators: int = defaults['n_estimators'],
                     estimator: str = defaults['estimator_r'],
                     optimize_feature_selection: bool = False,
                     parameter_tuning: bool = False,
                     palette: str = defaults['palette']) -> None:

    # extract column name from CategoricalMetadataColumn
    column = metadata.to_series().name

    # disable feature selection for unsupported estimators
    optimize_feature_selection, calc_feature_importance = \
        _disable_feature_selection(estimator, optimize_feature_selection)

    # specify parameters and distributions to sample from for parameter tuning
    estimator, param_dist, parameter_tuning = _set_parameters_and_estimator(
        estimator,
        table,
        metadata,
        column,
        n_estimators,
        n_jobs,
        cv,
        random_state,
        parameter_tuning,
        classification=True)

    estimator, cm, accuracy, importances = split_optimize_classify(
        table,
        metadata,
        column,
        estimator,
        output_dir,
        test_size=test_size,
        step=step,
        cv=cv,
        random_state=random_state,
        n_jobs=n_jobs,
        optimize_feature_selection=optimize_feature_selection,
        parameter_tuning=parameter_tuning,
        param_dist=param_dist,
        calc_feature_importance=calc_feature_importance,
        palette=palette)

    _visualize(output_dir,
               estimator,
               cm,
               accuracy,
               importances,
               optimize_feature_selection,
               title='classification predictions')
Example #23
    def test_none_matched(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['TTTT'],
                      name='Barcode',
                      index=pd.Index(['sample_d'], name='id')))

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), [''], obs_demuxed_art)
        self.assert_untrimmed_results(
            '@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
            '@id2\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
            '@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
            '@id4\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
            '@id5\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
            '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n', obs_untrimmed_art)
Example #24
    def test_none_matched(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['TTTT'],
                      name='Barcode',
                      index=pd.Index(['sample_d'], name='id')))

        with redirected_stdio(stderr=os.devnull):
            with self.assertRaisesRegex(ValueError, 'demultiplexed'):
                self.demux_single_fn(self.muxed_sequences, metadata)
Example #25
    def test_variable_length_barcodes(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAAA', 'CCCCCC', 'GGGG'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                     name='id')))
        muxed_sequences_fp = self.get_data_path('variable_length.fastq.gz')
        muxed_sequences = Artifact.import_data(
            'MultiplexedSingleEndBarcodeInSequence', muxed_sequences_fp)

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(muxed_sequences, metadata)

        # This test should yield the same results as test_typical, above, just
        # with variable length barcodes
        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
        self.assert_untrimmed_results(b'', obs_untrimmed_art)
Example #26
    def setUp(self):
        barcode_map = pd.Series(['GTCA', 'TCAG', 'GGGG'],
                                index=['sample1', 'sample2', 'sample3'],
                                name="aname")
        barcode_map.index.name = "sample_name"
        barcode_map = CategoricalMetadataColumn(barcode_map)

        seqs_fp = dir_path + "/data/small/"

        seqs = Artifact.import_data("EMPSingleEndSequences", seqs_fp)

        self.demuxed, = emp_single(seqs, barcode_map)
        self.exp = 1
Example #27
    def setUp(self):
        _ranks = pd.DataFrame([[4.1, 1.3, 2.1], [0.1, 0.3, 0.2],
                               [2.2, 4.3, 3.2], [-6.3, -4.4, 2.1]],
                              index=pd.Index([c for c in 'ABCD'], name='id'),
                              columns=['m1', 'm2', 'm3'])
        self.ranks = Artifact.import_data('FeatureData[Conditional]', _ranks)
        self.taxa = CategoricalMetadataColumn(
            pd.Series([
                'k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; '
                'o__Desulfobacterales; f__Desulfobulbaceae; g__; s__',
                'k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Streptophyta',
                'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; '
                'o__Rickettsiales; f__mitochondria; g__Lardizabala; s__biternata',
                'k__Archaea; p__Euryarchaeota; c__Methanomicrobia; '
                'o__Methanosarcinales; f__Methanosarcinaceae; g__Methanosarcina'
            ],
                      index=pd.Index([c for c in 'ABCD'], name='feature-id'),
                      name='Taxon'))
        self.metabolites = CategoricalMetadataColumn(
            pd.Series(['amino acid', 'carbohydrate', 'drug metabolism'],
                      index=pd.Index(['m1', 'm2', 'm3'], name='feature-id'),
                      name='Super Pathway'))
Example #28
def paired_heatmap(output_dir: str,
                   ranks: pd.DataFrame,
                   microbes_table: biom.Table,
                   metabolites_table: biom.Table,
                   features: str = None,
                   top_k_microbes: int = 2,
                   keep_top_samples: bool = True,
                   microbe_metadata: qiime2.CategoricalMetadataColumn = None,
                   normalize: str = 'log10',
                   color_palette: str = 'magma',
                   top_k_metabolites: int = 50,
                   level: int = -1,
                   row_center: bool = True) -> None:
    if microbe_metadata is not None:
        microbe_metadata = microbe_metadata.to_series()

    ranks = ranks.T

    if row_center:
        ranks = ranks - ranks.mean(axis=0)

    select_microbes, select_metabolites, hotmaps = paired_heatmaps(
        ranks, microbes_table, metabolites_table, microbe_metadata, features,
        top_k_microbes, top_k_metabolites, keep_top_samples, level, normalize,
        color_palette)

    hotmaps.savefig(join(output_dir, 'heatmap.pdf'), bbox_inches='tight')
    hotmaps.savefig(join(output_dir, 'heatmap.png'), bbox_inches='tight')
    select_microbes.to_csv(join(output_dir, 'select_microbes.tsv'), sep='\t')
    select_metabolites.to_csv(join(output_dir, 'select_metabolites.tsv'),
                              sep='\t')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'title':
                           'Paired Feature Abundance Heatmaps',
                           'pdf_fp':
                           'heatmap.pdf',
                           'png_fp':
                           'heatmap.png',
                           'table1_fp':
                           'select_microbes.tsv',
                           'download1_text':
                           'Download microbe abundances as TSV',
                           'table2_fp':
                           'select_metabolites.tsv',
                           'download2_text':
                           'Download top k metabolite abundances as TSV'
                       })
Example #29
def aldex2(table: pd.DataFrame,
           metadata: qiime2.CategoricalMetadataColumn,
           mc_samples: int = 128,
           test: str = 't',
           denom: str = 'all') -> pd.DataFrame:

    # create series from the metadata column
    meta = metadata.to_series()

    # The condition is the name of the passed metadata column
    condition = metadata.name

    # Filter the metadata so only the samples present in the table are used,
    # and reorder it to match the table so ALDEx2 assigns the correct
    # condition to each sample.
    meta = meta.loc[list(table.index)]

    with tempfile.TemporaryDirectory() as temp_dir_name:
        biom_fp = os.path.join(temp_dir_name, 'input.tsv.biom')
        map_fp = os.path.join(temp_dir_name, 'input.map.txt')
        summary_fp = os.path.join(temp_dir_name, 'output.summary.txt')

        # Need to manually specify header=True for Series (i.e. "meta"). It's
        # already the default for DataFrames (i.e. "table"), but we manually
        # specify it here anyway to alleviate any potential confusion.
        table.to_csv(biom_fp, sep='\t', header=True)
        meta.to_csv(map_fp, sep='\t', header=True)

        cmd = [
            'run_aldex2.R', biom_fp, map_fp, condition, mc_samples, test,
            denom, summary_fp
        ]
        cmd = list(map(str, cmd))

        try:
            run_commands([cmd])
        except subprocess.CalledProcessError as e:
            raise Exception("An error was encountered while running ALDEx2"
                            " in R (return code %d), please inspect stdout"
                            " and stderr to learn more." % e.returncode)

        summary = pd.read_csv(summary_fp, index_col=0)
        # differentials = summary[['effect']]
        # Restore the feature ID column name, which ALDEx2 drops in R because
        # of row.names = 1.

        summary.index.name = "featureid"
        summary.rename(index=str, inplace=True)
        return summary
Example #30
    def test_di_typical(self):
        forward_barcodes = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='ForwardBarcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
        reverse_barcodes = CategoricalMetadataColumn(
            pd.Series(['GGGG', 'TTTT'],
                      name='ReverseBarcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_paired_fn(self.muxed_sequences,
                                     forward_barcodes=forward_barcodes,
                                     reverse_barcodes=reverse_barcodes)

        self.assert_demux_results(forward_barcodes.to_series(),
                                  obs_demuxed_art)
        exp_untrimmed = [
            b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
            b'@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
        ]
        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
Example #31
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in natsorted(metadata.groupby(metadata))])

    pairs_summary = pd.DataFrame(columns=['SubjectID1', 'SubjectID2', 'Group1',
                                          'Group2', 'Distance'])
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        group_pairs_summary = pd.DataFrame(
            group_pairs_summary, columns=['SubjectID1', 'SubjectID2',
                                          'Group1', 'Group2', 'Distance'])

        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = list(groupings.keys())
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [group_ids[g:g+row_count] for g in range(0, group_count,
                                                          row_count)]

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
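
For reference, a tiny illustration (toy p-values, not from the source) of the Benjamini-Hochberg step used above to derive the q-value column:

from statsmodels.stats.multitest import multipletests

p_values = [0.001, 0.020, 0.030, 0.400]
reject, q_values, _, _ = multipletests(p_values, method='fdr_bh')
for p, q, r in zip(p_values, q_values, reject):
    print(p, round(q, 3), r)  # raw p-value, BH-adjusted q-value, reject flag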
Example #32
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)
    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)
    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]

    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            ancom_results[1].loc[significant_features.index])

    metadata = metadata.to_series()
    cats = list(set(metadata))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(
        transform_function, axis=1, result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # len(categories) > 2
            difference_function = 'f_statistic'

    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        args = _d_func(*[x[metadata == c] for c in cats])
        if isinstance(args, tuple):
            return args[0]
        else:
            return args
    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        volcano_results = pd.DataFrame({transform_function_name: fold_change,
                                        'W': ancom_results[0].W})
        volcano_results = volcano_results.reset_index(drop=False)

        spec = {
            '$schema': 'https://vega.github.io/schema/vega/v4.json',
            'width': 300,
            'height': 300,
            'data': [
                {'name': 'values',
                 'values': volcano_results.to_dict(orient='records')}],
            'scales': [
                {'name': 'xScale',
                 'domain': {'data': 'values',
                            'field': transform_function_name},
                 'range': 'width'},
                {'name': 'yScale',
                 'domain': {'data': 'values', 'field': 'W'},
                 'range': 'height'}],
            'axes': [
                {'scale': 'xScale', 'orient': 'bottom',
                 'title': transform_function_name},
                {'scale': 'yScale', 'orient': 'left', 'title': 'W'}],
            'marks': [
              {'type': 'symbol',
               'from': {'data': 'values'},
               'encode': {
                   'hover': {
                       'fill': {'value': '#FF0000'},
                       'opacity': {'value': 1}},
                   'enter': {
                       'x': {'scale': 'xScale',
                             'field': transform_function_name},
                       'y': {'scale': 'yScale', 'field': 'W'}},
                   'update': {
                       'fill': {'value': 'black'},
                       'opacity': {'value': 0.3},
                       'tooltip': {
                           'signal': "{{'title': datum['index'], '{0}': "
                                     "datum['{0}'], 'W': datum['W']}}".format(
                                         transform_function_name)}}}}]}
        context['vega_spec'] = json.dumps(spec)

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.tsv'),
                            header=True, index=True, sep='\t')
    ancom_results[1].to_csv(os.path.join(output_dir,
                                         'percent-abundances.tsv'),
                            header=True, index=True, sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)
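
A minimal sketch (toy table and metadata, with a hypothetical mean_difference helper) of the per-feature, per-group difference that diff_func computes above:

import pandas as pd

table = pd.DataFrame({'featA': [1.0, 2.0, 5.0, 6.0],
                      'featB': [3.0, 3.0, 3.0, 9.0]},
                     index=['s1', 's2', 's3', 's4'])
metadata = pd.Series(['ctrl', 'ctrl', 'case', 'case'],
                     index=['s1', 's2', 's3', 's4'])
cats = sorted(set(metadata))

def mean_difference(x):
    # Split one feature's values by metadata category and compare group means.
    a, b = (x[metadata == c] for c in cats)
    return a.mean() - b.mean()

fold_change = table.apply(mean_difference, axis=0)
print(fold_change)  # one difference per feature, analogous to diff_func above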