Example #1
def heatmap(output_dir: str,
            ranks: pd.DataFrame,
            microbe_metadata: qiime2.CategoricalMetadataColumn = None,
            metabolite_metadata: qiime2.CategoricalMetadataColumn = None,
            method: str = 'average',
            metric: str = 'euclidean',
            color_palette: str = 'seismic',
            margin_palette: str = 'cubehelix',
            x_labels: bool = False,
            y_labels: bool = False,
            level: int = -1) -> None:
    if microbe_metadata is not None:
        microbe_metadata = microbe_metadata.to_series()
    if metabolite_metadata is not None:
        metabolite_metadata = metabolite_metadata.to_series()

    hotmap = ranks_heatmap(ranks, microbe_metadata, metabolite_metadata,
                           method, metric, color_palette, margin_palette,
                           x_labels, y_labels, level)

    hotmap.savefig(join(output_dir, 'heatmap.pdf'), bbox_inches='tight')
    hotmap.savefig(join(output_dir, 'heatmap.png'), bbox_inches='tight')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'title': 'Rank Heatmap',
                           'pdf_fp': 'heatmap.pdf',
                           'png_fp': 'heatmap.png'
                       })
Example #2
def subsample_longitudinal(dates: qiime2.CategoricalMetadataColumn,
                           start_date: str = None,
                           samples_per_interval: int = 7,
                           days_per_interval: int = 7,
                           seed: int = None) -> IDSelection:

    window_size = '%dD' % days_per_interval

    dt_series = pd.to_datetime(dates.to_series(), errors='coerce')
    df = pd.DataFrame({'ids': dates.to_series().index}, index=dt_series)

    if start_date is not None:
        filter_before = pd.Timestamp(start_date)
        df = df.iloc[np.where(dt_series >= filter_before)]
        if filter_before not in df.index:
            # this will be stripped in _sample_group::_sampler
            # the purpose is to force Pandas to begin the window at this
            # time instead of the first observation (by making NaN the first
            # observation)
            df.loc[filter_before] = float('nan')

    grouped = df.groupby(pd.Grouper(freq=window_size,
                                    convention='start',
                                    closed='left'),
                         group_keys=False)
    filtered_df = grouped.apply(_sample_group(samples_per_interval, seed))

    selection = pd.Series(False, index=dates.to_series().index)
    selection[filtered_df['ids']] = True

    md = qiime2.Metadata(dates.to_dataframe())
    return IDSelection(selection, md, 'subsample_longitudinal')
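
The _sample_group helper called above is defined elsewhere in the plugin. A minimal sketch of what it plausibly looks like, inferred from the call site (a factory whose result is applied to each window) and the placeholder-stripping comment; the body below is a hypothetical illustration, not the actual implementation:

def _sample_group(n, seed):
    # Factory: returns the per-window sampler handed to GroupBy.apply.
    def _sampler(group):
        # Strip the NaN placeholder row inserted to anchor the first window.
        group = group.dropna(axis=0)
        if len(group) <= n:
            return group
        return group.sample(n=n, random_state=seed)
    return _sampler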
Example #3
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn, mode: str) -> biom.Table:
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    if axis == 'feature':
        biom_axis = 'observation'
    else:
        biom_axis = axis

    metadata = _munge_metadata_column(metadata, table.ids(axis=biom_axis),
                                      axis)

    grouped_table = table.collapse(
        lambda axis_id, _: metadata.get_value(axis_id),
        collapse_f=_mode_lookup[mode],
        axis=biom_axis,
        norm=False,
        include_collapsed_metadata=False)
    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    series = metadata.to_series()
    return grouped_table.sort_order(series.unique(), axis=biom_axis)
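
The _mode_lookup dispatch passed to collapse_f is defined elsewhere. A plausible sketch, assuming each mode reduces the dense matrix of a partitioned sub-table (the names and reducers below are assumptions, not the plugin's actual code):

import numpy as np

def _collapse_factory(func):
    # Assumption: biom.Table.collapse hands each partitioned sub-table to
    # collapse_f and expects the reduced vector for that partition back.
    def collapse_f(table, axis):
        return func(table.matrix_data.toarray(), axis=1)
    return collapse_f

_mode_lookup = {
    'sum': _collapse_factory(np.sum),
    'mean-ceiling': _collapse_factory(
        lambda data, axis: np.ceil(np.mean(data, axis=axis))),
    'median-ceiling': _collapse_factory(
        lambda data, axis: np.ceil(np.median(data, axis=axis))),
}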
Example #4
    def test_typical(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
        exp = [
            # sample a, fwd
            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
            # sample a, rev
            '@id1\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
            '@id3\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n',
            # sample b, fwd
            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
            # sample b, rev
            '@id2\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
            '@id4\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
            '@id5\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n',
        ]
        exp_untrimmed = [
            '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
            '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_paired_fn(self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
Example #5
    def test_mixed_orientation_success(self):
        forward_barcodes = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='ForwardBarcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))

        mixed_orientation_sequences_f_fp = self.get_data_path(
            'mixed-orientation/forward.fastq.gz')
        mixed_orientation_sequences_r_fp = self.get_data_path(
            'mixed-orientation/reverse.fastq.gz')

        with tempfile.TemporaryDirectory() as temp:
            shutil.copy(mixed_orientation_sequences_f_fp, temp)
            shutil.copy(mixed_orientation_sequences_r_fp, temp)
            mixed_orientation_sequences = Artifact.import_data(
                'MultiplexedPairedEndBarcodeInSequence', temp)

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_paired_fn(mixed_orientation_sequences,
                                     forward_barcodes=forward_barcodes,
                                     mixed_orientation=True)

        self.assert_demux_results(forward_barcodes.to_series(),
                                  obs_demuxed_art)
        # Everything should match
        self.assert_untrimmed_results([b'', b''], obs_untrimmed_art)
Example #6
    def test_batch_size_odd_number_of_samples(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC', 'GGGG'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                     name='id')))
        exp = [
            # sample a
            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
            # sample b
            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
            # sample c
            '@id6\nACGTACGT\n+\nzzzzzzzz\n',
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata,
                                     batch_size=2)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results('', obs_untrimmed_art)
Example #7
    def test_min_length(self):
        metadata = CategoricalMetadataColumn(
            # The third barcode is meant to completely remove the only GGGG
            # coded sequence
            pd.Series(['AAAA', 'CCCC', 'GGGGACGTACGT'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                     name='id')))
        exp = [
            # sample a
            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
            # sample b
            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
            # sample c is empty because the barcode matched the entire
            # read, which removed everything.
            '',
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results('', obs_untrimmed_art)
Example #8
    def test_batch_size(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
        exp = [
            # sample a
            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
            # sample b
            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata,
                                     batch_size=1)

        # This test should yield the same results as test_typical, above,
        # the fact that we are batching shouldn't impact the final results
        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results('@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                      obs_untrimmed_art)
Example #9
    def test_variable_length_barcodes(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAAA', 'CCCCCC', 'GGGG'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                     name='id')))
        muxed_sequences_fp = self.get_data_path('variable_length.fastq.gz')
        muxed_sequences = Artifact.import_data(
            'MultiplexedSingleEndBarcodeInSequence', muxed_sequences_fp)
        exp = [
            # sample a
            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
            # sample b
            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
            # sample c
            '@id6\nACGTACGT\n+\nzzzzzzzz\n',
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results('', obs_untrimmed_art)
Example #10
def classify_samples(output_dir: str,
                     table: pd.DataFrame,
                     metadata: qiime2.CategoricalMetadataColumn,
                     test_size: float = defaults['test_size'],
                     step: float = defaults['step'],
                     cv: int = defaults['cv'],
                     random_state: int = None,
                     n_jobs: int = defaults['n_jobs'],
                     n_estimators: int = defaults['n_estimators'],
                     estimator: str = defaults['estimator_r'],
                     optimize_feature_selection: bool = False,
                     parameter_tuning: bool = False,
                     palette: str = defaults['palette']) -> None:

    # extract column name from CategoricalMetadataColumn
    column = metadata.to_series().name

    # disable feature selection for unsupported estimators
    optimize_feature_selection, calc_feature_importance = \
        _disable_feature_selection(estimator, optimize_feature_selection)

    # specify parameters and distributions to sample from for parameter tuning
    estimator, param_dist, parameter_tuning = _set_parameters_and_estimator(
        estimator,
        table,
        metadata,
        column,
        n_estimators,
        n_jobs,
        cv,
        random_state,
        parameter_tuning,
        classification=True)

    estimator, cm, accuracy, importances = split_optimize_classify(
        table,
        metadata,
        column,
        estimator,
        output_dir,
        test_size=test_size,
        step=step,
        cv=cv,
        random_state=random_state,
        n_jobs=n_jobs,
        optimize_feature_selection=optimize_feature_selection,
        parameter_tuning=parameter_tuning,
        param_dist=param_dist,
        calc_feature_importance=calc_feature_importance,
        palette=palette)

    _visualize(output_dir,
               estimator,
               cm,
               accuracy,
               importances,
               optimize_feature_selection,
               title='classification predictions')
Example #11
    def test_mixed_orientation_success(self):
        # sample_a and sample_b have reads in both fwd and rev directions.
        # sample_c only has reads in the fwd direction.
        # sample_d only has reads in the rev direction.
        forward_barcodes = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'],
                      name='ForwardBarcode',
                      index=pd.Index(
                          ['sample_a', 'sample_b', 'sample_c', 'sample_d'],
                          name='id')))
        mixed_orientation_sequences_f_fp = self.get_data_path(
            'mixed-orientation/forward.fastq.gz')
        mixed_orientation_sequences_r_fp = self.get_data_path(
            'mixed-orientation/reverse.fastq.gz')
        with tempfile.TemporaryDirectory() as temp:
            shutil.copy(mixed_orientation_sequences_f_fp, temp)
            shutil.copy(mixed_orientation_sequences_r_fp, temp)
            mixed_orientation_sequences = Artifact.import_data(
                'MultiplexedPairedEndBarcodeInSequence', temp)

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_paired_fn(mixed_orientation_sequences,
                                     forward_barcodes=forward_barcodes,
                                     mixed_orientation=True)
        exp = [
            # sample_a fwd
            '@id1\nACGTACGT\n+\nyyyyyyyy\n'
            '@id3\nACGTACGT\n+\nyyyyyyyy\n',
            # sample_a rev
            '@id1\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
            '@id3\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
            # sample_b fwd
            '@id4\nACGTACGT\n+\nyyyyyyyy\n'
            '@id2\nACGTACGT\n+\nyyyyyyyy\n',
            # sample_b rev
            '@id4\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
            '@id2\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
            # sample_c fwd
            '@id5\nACGTACGT\n+\nyyyyyyyy\n',
            # sample_c rev
            '@id5\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
            # sample_d fwd
            '@id6\nACGTACGT\n+\nyyyyyyyy\n',
            # sample_d rev
            '@id6\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
        ]

        # Validate at the strictest level, not just `min`; validate() raises
        # on failure, so reaching the assertions below means the artifact
        # passed full validation.
        obs_demuxed_art.validate(level='max')

        self.assert_demux_results(forward_barcodes.to_series(), exp,
                                  obs_demuxed_art)

        # Everything should match, so untrimmed should be empty
        self.assert_untrimmed_results(['', ''], obs_untrimmed_art)
Example #12
    def test_typical(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
        self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                      obs_untrimmed_art)
Example #13
def paired_heatmap(output_dir: str,
                   ranks: pd.DataFrame,
                   microbes_table: biom.Table,
                   metabolites_table: biom.Table,
                   features: str = None,
                   top_k_microbes: int = 2,
                   keep_top_samples: bool = True,
                   microbe_metadata: qiime2.CategoricalMetadataColumn = None,
                   normalize: str = 'log10',
                   color_palette: str = 'magma',
                   top_k_metabolites: int = 50,
                   level: int = -1,
                   row_center: bool = True) -> None:
    if microbe_metadata is not None:
        microbe_metadata = microbe_metadata.to_series()

    ranks = ranks.T

    if row_center:
        ranks = ranks - ranks.mean(axis=0)

    select_microbes, select_metabolites, hotmaps = paired_heatmaps(
        ranks, microbes_table, metabolites_table, microbe_metadata, features,
        top_k_microbes, top_k_metabolites, keep_top_samples, level, normalize,
        color_palette)

    hotmaps.savefig(join(output_dir, 'heatmap.pdf'), bbox_inches='tight')
    hotmaps.savefig(join(output_dir, 'heatmap.png'), bbox_inches='tight')
    select_microbes.to_csv(join(output_dir, 'select_microbes.tsv'), sep='\t')
    select_metabolites.to_csv(join(output_dir, 'select_metabolites.tsv'),
                              sep='\t')

    index = join(TEMPLATES, 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'title': 'Paired Feature Abundance Heatmaps',
                           'pdf_fp': 'heatmap.pdf',
                           'png_fp': 'heatmap.png',
                           'table1_fp': 'select_microbes.tsv',
                           'download1_text':
                               'Download microbe abundances as TSV',
                           'table2_fp': 'select_metabolites.tsv',
                           'download2_text':
                               'Download top k metabolite abundances as TSV'
                       })
Example #14
def aldex2(table: pd.DataFrame,
           metadata: qiime2.CategoricalMetadataColumn,
           mc_samples: int = 128,
           test: str = 't',
           denom: str = 'all') -> pd.DataFrame:

    # create series from the metadata column
    meta = metadata.to_series()

    # The condition is just the only column in the passed metadata column
    condition = metadata.name

    # Filter the metadata so that only samples present in the table are
    # used, and reorder it to match the table so that ALDEx2 receives the
    # conditions in the correct order.
    meta = meta.loc[list(table.index)]

    with tempfile.TemporaryDirectory() as temp_dir_name:
        biom_fp = os.path.join(temp_dir_name, 'input.tsv.biom')
        map_fp = os.path.join(temp_dir_name, 'input.map.txt')
        summary_fp = os.path.join(temp_dir_name, 'output.summary.txt')

        # Need to manually specify header=True for Series (i.e. "meta"). It's
        # already the default for DataFrames (i.e. "table"), but we manually
        # specify it here anyway to alleviate any potential confusion.
        table.to_csv(biom_fp, sep='\t', header=True)
        meta.to_csv(map_fp, sep='\t', header=True)

        cmd = [
            'run_aldex2.R', biom_fp, map_fp, condition, mc_samples, test,
            denom, summary_fp
        ]
        cmd = list(map(str, cmd))

        try:
            run_commands([cmd])
        except subprocess.CalledProcessError as e:
            raise Exception("An error was encountered while running ALDEx2"
                            " in R (return code %d), please inspect stdout"
                            " and stderr to learn more." % e.returncode)

        summary = pd.read_csv(summary_fp, index_col=0)
        # Restore the feature ID column name, which ALDEx2 drops in R as a
        # side effect of row.names = 1.
        summary.index.name = "featureid"
        summary.rename(index=str, inplace=True)
        return summary
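
run_commands follows the pattern QIIME 2 plugins commonly use for shelling out to external tools such as the run_aldex2.R script above. A sketch under that assumption (the real helper may differ in its messaging):

import subprocess

def run_commands(cmds, verbose=True):
    if verbose:
        print('Running external command line application(s). This may print'
              ' messages to stdout and/or stderr.')
    for cmd in cmds:
        if verbose:
            print('Command:', ' '.join(cmd), end='\n\n')
        # check=True raises CalledProcessError on a non-zero exit status,
        # which aldex2() above converts into a friendlier error message.
        subprocess.run(cmd, check=True)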
Example #15
    def test_all_matched(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC', 'GGGG'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                     name='id')))

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
        # obs_untrimmed should be empty, since everything matched
        self.assert_untrimmed_results(b'', obs_untrimmed_art)
Example #16
def estimate(counts: pd.DataFrame,
             replicates: qiime2.CategoricalMetadataColumn,
             batches: qiime2.CategoricalMetadataColumn,
             monte_carlo_samples: int = 100,
             cores: int = 1) -> az.InferenceData:
    # match everything up
    replicates = replicates.to_series()
    batches = batches.to_series()
    idx = list(set(counts.index) & set(replicates.index) & set(batches.index))
    counts, replicates, batches = [
        x.loc[idx] for x in (counts, replicates, batches)
    ]
    replicates, batches = replicates.values, batches.values
    depth = counts.sum(axis=1)

    def pfunc(x):
        return _batch_func(np.array(x.values), replicates, batches,
                           depth, monte_carlo_samples)

    if cores > 1:
        try:
            import dask.dataframe as dd
            dcounts = dd.from_pandas(counts.T, npartitions=cores)
            res = dcounts.apply(pfunc, axis=1)
            resdf = res.compute(scheduler='processes')
            data_df = list(resdf.values)
        except ImportError:
            # fall back to a serial apply if dask is not installed
            data_df = list(counts.T.apply(pfunc, axis=1).values)
    else:
        data_df = list(counts.T.apply(pfunc, axis=1).values)

    inf_list = data_df
    coords = {
        'features': counts.columns,
        'monte_carlo_samples': np.arange(monte_carlo_samples)
    }

    samples = merge_inferences(inf_list, 'y_predict', 'log_lhood', coords)

    return samples
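
The dask pattern above (partition a DataFrame by rows, apply a function per row, compute on the process scheduler) can be illustrated standalone. A self-contained toy sketch, not part of the plugin:

import dask.dataframe as dd
import pandas as pd

df = pd.DataFrame({'a': range(8), 'b': range(8)})
ddf = dd.from_pandas(df, npartitions=4)
# meta declares the name/dtype of the result so dask can build the task
# graph without computing it.
row_sums = ddf.apply(lambda row: row.sum(), axis=1, meta=('sum', 'int64'))
# The 'processes' scheduler may require an `if __name__ == '__main__':`
# guard on platforms that spawn worker processes.
print(row_sums.compute(scheduler='processes'))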
Example #17
    def test_error_tolerance_high_enough_to_prevent_filtering(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAG', 'CCCC'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata,
                                     error_rate=0.25)

        # This test should yield the same results as test_typical, above
        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
        self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                      obs_untrimmed_art)
Example #18
def rename_ids(table: biom.Table,
               metadata: qiime2.CategoricalMetadataColumn,
               axis: str = 'sample',
               strict: bool = False) -> biom.Table:

    rename = metadata.to_series()
    if axis == 'feature':
        axis = 'observation'
    old_ids = table.ids(axis=axis)

    new_ids = _generate_new_names(old_ids, rename, strict, False)

    updated = table.update_ids(new_ids, axis=axis, inplace=False)

    return updated
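
_generate_new_names is defined elsewhere; a minimal hypothetical sketch that is consistent with how it is called here (old IDs, the rename Series, a strict flag, and a verbose flag):

def _generate_new_names(old_ids, rename, strict, verbose):
    # biom.Table.update_ids accepts a dict of {old_id: new_id}.
    new_ids = {}
    for old_id in old_ids:
        if old_id in rename.index:
            new_ids[old_id] = rename[old_id]
        elif strict:
            raise ValueError('There is no new name for ID %s and strict '
                             'mode was requested.' % old_id)
        else:
            new_ids[old_id] = old_id  # keep the original ID
    return new_ids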
Example #19
    def test_variable_length_barcodes(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAAA', 'CCCCCC', 'GGGG'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                     name='id')))
        muxed_sequences_fp = self.get_data_path('variable_length.fastq.gz')
        muxed_sequences = Artifact.import_data(
            'MultiplexedSingleEndBarcodeInSequence', muxed_sequences_fp)

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(muxed_sequences, metadata)

        # This test should yield the same results as test_typical, above, just
        # with variable length barcodes
        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
        self.assert_untrimmed_results(b'', obs_untrimmed_art)
Example #20
    def test_none_matched(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['TTTT'],
                      name='Barcode',
                      index=pd.Index(['sample_d'], name='id')))

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), [''], obs_demuxed_art)
        self.assert_untrimmed_results(
            '@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
            '@id2\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
            '@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
            '@id4\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
            '@id5\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
            '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n', obs_untrimmed_art)
Example #21
    def test_di_typical(self):
        forward_barcodes = CategoricalMetadataColumn(
            pd.Series(['AAAA', 'CCCC'],
                      name='ForwardBarcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
        reverse_barcodes = CategoricalMetadataColumn(
            pd.Series(['GGGG', 'TTTT'],
                      name='ReverseBarcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_paired_fn(self.muxed_sequences,
                                     forward_barcodes=forward_barcodes,
                                     reverse_barcodes=reverse_barcodes)

        self.assert_demux_results(forward_barcodes.to_series(),
                                  obs_demuxed_art)
        exp_untrimmed = [
            b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
            b'@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
        ]
        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
Example #22
    def test_error_tolerance_filtering(self):
        metadata = CategoricalMetadataColumn(
            pd.Series(['AAAG', 'CCCC'],
                      name='Barcode',
                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
        exp = [
            # sample a has no reads (because the barcode is misspelled)
            '',
            # sample b
            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
        ]

        with redirected_stdio(stderr=os.devnull):
            obs_demuxed_art, obs_untrimmed_art = \
                self.demux_single_fn(self.muxed_sequences, metadata)

        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
        self.assert_untrimmed_results(
            '@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
            '@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
            '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n', obs_untrimmed_art)
Example #23
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:

    index_fp = os.path.join(output_dir, 'index.html')

    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)

    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    # scikit-bio 0.4.2 returns a single tuple from ancom, and scikit-bio 0.5.0
    # returns two tuples. We want to support both scikit-bio versions, so we
    # tuplize ancom_result to support both. Similarly, the "reject" column
    # was renamed in scikit-bio 0.5.0, so we apply a rename here (which does
    # nothing if a column called "reject" isn't found).
    ancom_results = qiime2.core.util.tuplize(ancom_results)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)

    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.csv'),
                            header=True,
                            index=True)

    html = _volcanoplot(output_dir, table, metadata, ancom_results[0],
                        transform_function, difference_function)

    significant_features = ancom_results[0][ancom_results[0]
                                            ['Reject null hypothesis']]
    significant_features_present = not significant_features.empty
    insignificant_div = ('<div>No significant features identified!</div>')

    with open(index_fp, 'w') as index_f:
        index_f.write('<html>\n')
        if html is not None:
            index_f.write('<head>\n')
            index_f.write(INLINE.render())
            index_f.write('</head>\n')
        index_f.write('<body>\n')
        index_f.write('<h1>ANCOM statistical results</h1>\n')
        index_f.write('<a href="ancom.csv">Download complete table as CSV</a>'
                      '<br>\n')
        if significant_features_present:
            index_f.write(
                q2templates.df_to_html(significant_features['W'].to_frame(),
                                       border=None,
                                       classes=None))
        else:
            index_f.write(insignificant_div)
        if len(ancom_results) == 2:
            ancom_results[1].to_csv(os.path.join(output_dir,
                                                 'percent-abundances.csv'),
                                    header=True,
                                    index=True)
            index_f.write(('<h1>Percentile abundances of features '
                           'by group</h1>\n'))
            index_f.write(('<a href="percent-abundances.csv">'
                           'Download complete table as CSV</a><br>\n'))
            if significant_features_present:
                index_f.write(
                    q2templates.df_to_html(
                        ancom_results[1].loc[significant_features.index],
                        border=None,
                        classes=None))
            else:
                index_f.write(insignificant_div)
        if html is not None:
            index_f.write(html[1])
            index_f.write(html[0])
        else:
            index_f.write('<p>Unable to generate volcano plot, please check '
                          'the ANCOM statistical results (above).</p>\n')
        index_f.write('</body></html>\n')
Example #24
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix,
                                        metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict([
        (id, list(series.index))
        for id, series in natsorted(metadata.groupby(metadata))
    ])

    pairs_summary = pd.DataFrame(
        columns=['SubjectID1', 'SubjectID2', 'Group1', 'Group2', 'Distance'])
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        group_pairs_summary = pd.DataFrame(group_pairs_summary,
                                           columns=[
                                               'SubjectID1', 'SubjectID2',
                                               'Group1', 'Group2', 'Distance'
                                           ])

        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])

        ax = sns.boxplot(data=group_distances,
                         flierprops={
                             'marker': 'o',
                             'markeredgecolor': 'black',
                             'markeredgewidth': 0.5,
                             'alpha': 0.5
                         })
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(
            os.path.join(output_dir, '%s-boxplots.png' %
                         urllib.parse.quote(str(group_id))))
        fig.savefig(
            os.path.join(output_dir, '%s-boxplots.pdf' %
                         urllib.parse.quote(str(group_id))))
        fig.clear()

    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([
                group1_id, group2_id, pairwise_result['sample size'],
                permutations, pairwise_result['test statistic'],
                pairwise_result['p-value']
            ])
        columns = [
            'Group 1', 'Group 2', 'Sample size', 'Permutations',
            result['test statistic name'], 'p-value'
        ]
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = [
        # We have to DOUBLE encode this, as the file/resource name is a literal
        # URI-encoded string, we do this to prevent issues with the filesystem
        # however, as a result, our links need to escape % so that the browser
        # asks for the right escaped name (instead of the original name, which
        # doesn't exist inside the visualization).
        urllib.parse.quote(urllib.parse.quote(k)) for k in groupings.keys()
    ]
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [
        group_ids[g:g + row_count] for g in range(0, group_count, row_count)
    ]

    index = os.path.join(TEMPLATES, 'beta_group_significance_assets',
                         'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'initial_dm_length': initial_dm_length,
                           'filtered_dm_length': filtered_dm_length,
                           'method': method,
                           'group_rows': group_rows,
                           'bootstrap_group_col_size': int(12 / row_count),
                           'result': result_html,
                           'pairwise_results': pairwise_results_html
                       })
Example #25
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)
    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)
    significant_features = ancom_results[0][ancom_results[0]
                                            ['Reject null hypothesis']]

    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            ancom_results[1].loc[significant_features.index])

    metadata = metadata.to_series()
    cats = list(set(metadata))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(transform_function,
                                    axis=1,
                                    result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # len(cats) > 2
            difference_function = 'f_statistic'

    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        args = _d_func(*[x[metadata == c] for c in cats])
        if isinstance(args, tuple):
            return args[0]
        else:
            return args

    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        pre_filtered_ids = set(fold_change.index)
        with pd.option_context('mode.use_inf_as_na', True):
            fold_change = fold_change.dropna(axis=0)
        filtered_ids = pre_filtered_ids - set(fold_change.index)
        filtered_ancom_results = ancom_results[0].drop(labels=filtered_ids)

        volcano_results = pd.DataFrame({
            transform_function_name: fold_change,
            'W': filtered_ancom_results.W
        })
        volcano_results.index.name = 'id'
        volcano_results.to_csv(os.path.join(output_dir, 'data.tsv'),
                               header=True,
                               index=True,
                               sep='\t')
        volcano_results = volcano_results.reset_index(drop=False)

        spec = {
            '$schema': 'https://vega.github.io/schema/vega/v4.json',
            'width': 300,
            'height': 300,
            'data': [{
                'name': 'values',
                'values': volcano_results.to_dict(orient='records')
            }],
            'scales': [{
                'name': 'xScale',
                'domain': {
                    'data': 'values',
                    'field': transform_function_name
                },
                'range': 'width'
            }, {
                'name': 'yScale',
                'domain': {
                    'data': 'values',
                    'field': 'W'
                },
                'range': 'height'
            }],
            'axes': [{
                'scale': 'xScale',
                'orient': 'bottom',
                'title': transform_function_name
            }, {
                'scale': 'yScale',
                'orient': 'left',
                'title': 'W'
            }],
            'marks': [{
                'type': 'symbol',
                'from': {
                    'data': 'values'
                },
                'encode': {
                    'hover': {
                        'fill': {
                            'value': '#FF0000'
                        },
                        'opacity': {
                            'value': 1
                        }
                    },
                    'enter': {
                        'x': {
                            'scale': 'xScale',
                            'field': transform_function_name
                        },
                        'y': {
                            'scale': 'yScale',
                            'field': 'W'
                        }
                    },
                    'update': {
                        'fill': {
                            'value': 'black'
                        },
                        'opacity': {
                            'value': 0.3
                        },
                        'tooltip': {
                            'signal':
                            "{{'title': datum['id'], '{0}': "
                            "datum['{0}'], 'W': datum['W']}}".format(
                                transform_function_name)
                        }
                    }
                }
            }]
        }
        context['vega_spec'] = json.dumps(spec)
        if filtered_ids:
            context['filtered_ids'] = ', '.join(sorted(filtered_ids))

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.tsv'),
                            header=True,
                            index=True,
                            sep='\t')
    ancom_results[1].to_csv(os.path.join(output_dir, 'percent-abundances.tsv'),
                            header=True,
                            index=True,
                            sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)
Example #26
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in natsorted(metadata.groupby(metadata))])

    pairs_summary = pd.DataFrame(columns=['SubjectID1', 'SubjectID2', 'Group1',
                                          'Group2', 'Distance'])
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        group_pairs_summary = pd.DataFrame(
            group_pairs_summary, columns=['SubjectID1', 'SubjectID2',
                                          'Group1', 'Group2', 'Distance'])

        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = list(groupings.keys())
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [group_ids[g:g+row_count] for g in range(0, group_count,
                                                          row_count)]

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
Example #27
def classify(
    features: pd.DataFrame,
    y: qiime2.CategoricalMetadataColumn,
    c: np.ndarray = None,
    weights: np.ndarray = None,
    # taxa: skbio.TreeNode = None,
    # PATH parameters :
    path: bool = True,
    path_numerical_method: str = "not specified",
    path_n_active: int = 0,
    path_nlam_log: int = 40,
    path_lamin_log: float = 1e-2,
    # CV parameters :
    cv: bool = True,
    cv_numerical_method: str = "not specified",
    cv_seed: int = 1,
    cv_one_se: bool = True,
    cv_subsets: int = 5,
    cv_nlam: int = 100,
    cv_lamin: float = 1e-3,
    cv_logscale: bool = True,
    # StabSel parameters :
    stabsel: bool = True,
    stabsel_numerical_method: str = "not specified",
    stabsel_seed: int = None,  # TODO: tighten this; for now it may also be a bool
    stabsel_lam: float = -1.0,  # if negative, then it means 'theoretical'
    stabsel_true_lam: bool = True,
    stabsel_method: str = "first",
    stabsel_b: int = 50,
    stabsel_q: int = 10,
    stabsel_percent_ns: float = 0.5,
    stabsel_lamin: float = 1e-2,
    stabsel_threshold: float = 0.7,
    stabsel_threshold_label: float = 0.4,
    # possibly unneeded here, but needed for visualisation
    # LAMfixed parameters :
    lamfixed: bool = True,
    lamfixed_numerical_method: str = "not specified",
    lamfixed_lam: float = -1.0,  # if negative, then it means 'theoretical'
    lamfixed_true_lam: bool = True,
    # Formulation parameters
    huber: bool = False,
    rho: float = 0.0,
    intercept: bool = True,
) -> classo_problem:

    complete_y = y.to_series()
    complete_y = complete_y[~complete_y.isna()]
    first_cell = complete_y.iloc[0]

    features, pdY = features.align(y.to_series(), join="inner", axis=0)
    missing = pdY.isna()
    training_labels = list(pdY[~missing].index)
    label_missing = list(pdY.index[missing])
    if label_missing:
        print("{} are missing in y ".format(label_missing))
    Y = pdY[~missing].to_numpy()
    X = features.values[~missing, :]

    verfify_binary(Y)
    Y = Y == first_cell
    Y = 2 * Y - 1

    problem = classo_problem(X, Y, C=c, label=list(features.columns))
    problem.formulation.classification = True
    problem.formulation.concomitant = False
    problem.formulation.huber = huber
    problem.formulation.rho_classification = rho
    problem.formulation.intercept = intercept
    d = X.shape[1]
    if weights is not None:
        if len(weights) < d:
            problem.formulation.w = np.concatenate(
                [weights, np.ones(d - len(weights))], axis=0)
        else:
            problem.formulation.w = weights[:d]

    problem.model_selection.PATH = path
    if path:
        param = problem.model_selection.PATHparameters
        param.numerical_method = path_numerical_method
        param.n_active = path_n_active
        param.logscale = True
        param.Nlam = path_nlam_log
        param.lamin = path_lamin_log

    problem.model_selection.CV = cv
    if cv:
        param = problem.model_selection.CVparameters
        param.numerical_method = cv_numerical_method
        param.seed = cv_seed
        param.oneSE = cv_one_se
        param.Nsubsets = cv_subsets
        param.lamin = cv_lamin
        param.Nlam = cv_nlam
        param.logscale = cv_logscale

    problem.model_selection.StabSel = stabsel
    if stabsel:
        param = problem.model_selection.StabSelparameters
        param.numerical_method = stabsel_numerical_method
        param.seed = stabsel_seed
        param.true_lam = stabsel_true_lam
        param.method = stabsel_method
        param.B = stabsel_b
        param.q = stabsel_q
        param.percent_nS = stabsel_percent_ns
        param.lamin = stabsel_lamin
        param.threshold = stabsel_threshold
        param.threshold_label = stabsel_threshold_label
        if stabsel_lam > 0.0:
            param.lam = stabsel_lam
        else:
            param.lam = "theoretical"

    problem.model_selection.LAMfixed = lamfixed
    if lamfixed:
        param = problem.model_selection.LAMfixedparameters
        param.numerical_method = lamfixed_numerical_method
        param.true_lam = lamfixed_true_lam
        if lamfixed_lam > 0.0:
            param.lam = lamfixed_lam
        else:
            param.lam = "theoretical"

    problem.solve()

    cy = complete_y.values
    problem.data.complete_y = 2 * (cy == cy[0]) - 1
    problem.data.complete_labels = list(complete_y.index)
    problem.data.training_labels = training_labels

    return problem
Пример #29
0
def dendrogram_heatmap(output_dir: str, table: pd.DataFrame,
                       tree: TreeNode,
                       metadata: qiime2.CategoricalMetadataColumn,
                       pseudocount: float = 0.5,
                       ndim: int = 10, method: str = 'clr',
                       color_map: str = 'viridis'):

    table, tree = match_tips(add_pseudocount(table, pseudocount), tree)
    nodes = [n.name for n in tree.levelorder() if not n.is_tip()]

    nlen = min(ndim, len(nodes))
    numerator_color, denominator_color = '#fb9a99', '#e31a1c'
    highlights = pd.DataFrame([[numerator_color, denominator_color]] * nlen,
                              index=nodes[:nlen])
    if method == 'clr':
        mat = pd.DataFrame(clr(centralize(table)),
                           index=table.index,
                           columns=table.columns)
    elif method == 'log':
        mat = pd.DataFrame(np.log(table),
                           index=table.index,
                           columns=table.columns)
    c = metadata.to_series()
    table, c = match(table, c)
    # TODO: There are a few hard-coded constants here
    # will need to have some adaptive defaults set in the future
    fig = heatmap(mat, tree, c, highlights, cmap=color_map,
                  highlight_width=0.01, figsize=(12, 8))
    fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    fig.savefig(os.path.join(output_dir, 'heatmap.pdf'))

    css = r"""
        .square {
          float: left;
          width: 100px;
          height: 20px;
          margin: 5px;
          border: 1px solid rgba(0, 0, 0, .2);
        }

        .numerator {
          background: %s;
        }

        .denominator {
          background: %s;
        }
    """ % (numerator_color, denominator_color)

    index_fp = os.path.join(output_dir, 'index.html')
    with open(index_fp, 'w') as index_f:
        index_f.write('<html><body>\n')
        index_f.write('<h1>Dendrogram heatmap</h1>\n')
        index_f.write('<img src="heatmap.svg" alt="heatmap">')
        index_f.write('<a href="heatmap.pdf">')
        index_f.write('Download as PDF</a><br>\n')
        index_f.write('<style>%s</style>' % css)
        index_f.write('<div class="square numerator">'
                      'Numerator<br/></div>')
        index_f.write('<div class="square denominator">'
                      'Denominator<br/></div>')
        index_f.write('</body></html>\n')
Пример #30
0
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)
    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)
    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]

    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            ancom_results[1].loc[significant_features.index])

    metadata = metadata.to_series()
    cats = list(set(metadata))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(
        transform_function, axis=1, result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # len(categories) > 2
            difference_function = 'f_statistic'

    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        args = _d_func(*[x[metadata == c] for c in cats])
        if isinstance(args, tuple):
            return args[0]
        else:
            return args
    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        volcano_results = pd.DataFrame({transform_function_name: fold_change,
                                        'W': ancom_results[0].W})
        volcano_results = volcano_results.reset_index(drop=False)

        spec = {
            '$schema': 'https://vega.github.io/schema/vega/v4.json',
            'width': 300,
            'height': 300,
            'data': [
                {'name': 'values',
                 'values': volcano_results.to_dict(orient='records')}],
            'scales': [
                {'name': 'xScale',
                 'domain': {'data': 'values',
                            'field': transform_function_name},
                 'range': 'width'},
                {'name': 'yScale',
                 'domain': {'data': 'values', 'field': 'W'},
                 'range': 'height'}],
            'axes': [
                {'scale': 'xScale', 'orient': 'bottom',
                 'title': transform_function_name},
                {'scale': 'yScale', 'orient': 'left', 'title': 'W'}],
            'marks': [
              {'type': 'symbol',
               'from': {'data': 'values'},
               'encode': {
                   'hover': {
                       'fill': {'value': '#FF0000'},
                       'opacity': {'value': 1}},
                   'enter': {
                       'x': {'scale': 'xScale',
                             'field': transform_function_name},
                       'y': {'scale': 'yScale', 'field': 'W'}},
                   'update': {
                       'fill': {'value': 'black'},
                       'opacity': {'value': 0.3},
                       'tooltip': {
                           'signal': "{{'title': datum['index'], '{0}': "
                                     "datum['{0}'], 'W': datum['W']}}".format(
                                         transform_function_name)}}}}]}
        context['vega_spec'] = json.dumps(spec)

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.tsv'),
                            header=True, index=True, sep='\t')
    ancom_results[1].to_csv(os.path.join(output_dir,
                                         'percent-abundances.tsv'),
                            header=True, index=True, sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)