예제 #1
0
    def setUp(self):
        # Builds the fixtures used by the tests: an output directory, a small
        # tree, a taxonomy table, a balances table and two metadata
        # categories keyed by sample id.
        self.results = "results"
        # exist_ok makes directory creation idempotent without a separate
        # (race-prone) existence check.
        os.makedirs(self.results, exist_ok=True)

        self.tree = TreeNode.read([r'((k, q)d, ((x, y)a, z)b)c;'])
        self.taxonomy = pd.DataFrame(
            [['foo;barf;a;b;c;d;e', 1], ['foo;bark;f;g;h;i;j', 1],
             ['foo;bark;f;g;h;w;j', 1], ['nom;tu;k;l;m;n;o', 0.9],
             ['nom;tu;k;l;m;t;o', 0.9]],
            columns=['Taxon', 'Confidence'],
            index=['x', 'y', 'z', 'k', 'q'])

        # Defined with one row per internal node name (d, a, b, c) and then
        # transposed, so rows are samples s1..s7 and columns are the nodes.
        self.balances = pd.DataFrame(
            [[1, 2, 3, 4, 5, 6, 7], [-3.1, -2.9, -3, 3, 2.9, 3.2, 3.1],
             [1, 1, 1, 1, 1, 1, 1], [3, 2, 1, 0, -1, -2, -3]],
            index=['d', 'a', 'b', 'c'],
            columns=['s1', 's2', 's3', 's4', 's5', 's6', 's7']).T

        self.categorical = MetadataCategory(
            pd.Series(['a', 'a', 'a', 'b', 'b', 'b', 'b'],
                      index=['s1', 's2', 's3', 's4', 's5', 's6', 's7'],
                      name='categorical'))
        self.continuous = MetadataCategory(
            pd.Series(np.arange(7),
                      index=['s1', 's2', 's3', 's4', 's5', 's6', 's7'],
                      name='continuous'))
예제 #2
0
    def test_all_matched(self):
        # Three barcodes are supplied; the test expects all of them to show
        # up in the demultiplexed output and nothing to be left untrimmed.
        barcode_series = pd.Series(
            ['AAAA', 'CCCC', 'GGGG'],
            index=['sample_a', 'sample_b', 'sample_c'],
            name='Barcode')
        barcodes_md = MetadataCategory(barcode_series)

        with redirected_stdio(stderr=os.devnull):
            demuxed, untrimmed = self.demux_single_fn(
                self.muxed_sequences, barcodes_md)

        self.assert_demux_results(barcodes_md.to_series(), demuxed)
        # Everything matched, so the untrimmed output must be empty.
        self.assert_untrimmed_results(b'', untrimmed)
예제 #3
0
    def test_typical(self):
        # Only AAAA and CCCC are listed as barcodes; the read that starts
        # with GGGG (id6) is expected to land in the untrimmed output.
        barcode_series = pd.Series(
            ['AAAA', 'CCCC'],
            index=['sample_a', 'sample_b'],
            name='Barcode')
        barcodes_md = MetadataCategory(barcode_series)

        with redirected_stdio(stderr=os.devnull):
            demuxed, untrimmed = self.demux_single_fn(
                self.muxed_sequences, barcodes_md)

        expected_untrimmed = b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n'
        self.assert_demux_results(barcodes_md.to_series(), demuxed)
        self.assert_untrimmed_results(expected_untrimmed, untrimmed)
예제 #4
0
    def test_error_tolerance_high_enough_to_prevent_filtering(self):
        # sample_a's barcode is given as AAAG, and demultiplexing runs with
        # error_tolerance=0.25.
        barcode_series = pd.Series(
            ['AAAG', 'CCCC'],
            index=['sample_a', 'sample_b'],
            name='Barcode')
        barcodes_md = MetadataCategory(barcode_series)

        with redirected_stdio(stderr=os.devnull):
            demuxed, untrimmed = self.demux_single_fn(
                self.muxed_sequences, barcodes_md, error_tolerance=0.25)

        # Expected outcome is identical to test_typical: both samples are
        # demultiplexed and only the GGGG read (id6) stays untrimmed.
        expected_untrimmed = b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n'
        self.assert_demux_results(barcodes_md.to_series(), demuxed)
        self.assert_untrimmed_results(expected_untrimmed, untrimmed)
예제 #5
0
def distance_matrix(metadata: qiime2.MetadataCategory) -> skbio.DistanceMatrix:
    """Compute pairwise Euclidean distances between numeric metadata values.

    Parameters
    ----------
    metadata : qiime2.MetadataCategory
        Category whose values (via ``to_series()``) must all be convertible
        to numbers.

    Returns
    -------
    skbio.DistanceMatrix
        Distance matrix whose ids are the index of the metadata series.

    Raises
    ------
    ValueError
        If the category contains non-numeric values, is empty, or contains
        missing values.
    """
    try:
        series = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        # Chain the original exception so the pandas traceback is preserved
        # in addition to its message.
        raise ValueError(
            "Encountered non-numeric values in the metadata category. A "
            "distance matrix can only be computed from numeric metadata. "
            "Original error message:\n\n%s" % e) from e

    # TODO this check can be removed when MetadataCategory is no longer allowed
    # to be empty
    if series.empty:
        raise ValueError(
            "Encountered metadata category that is empty, i.e. there are no "
            "samples or features in the metadata to compute distances "
            "between.")

    if series.hasnans:
        raise ValueError(
            "Encountered missing value(s) in the metadata category. Computing "
            "a distance matrix from missing values is not supported.")

    # This code is derived from @jairideout's scikit-bio cookbook recipe,
    # "Exploring Microbial Community Diversity"
    # https://github.com/biocore/scikit-bio-cookbook
    #
    # pdist expects a 2-D (observations x features) array, so the 1-D values
    # are reshaped into a single-column matrix.
    distances = scipy.spatial.distance.pdist(series.values[:, np.newaxis],
                                             metric='euclidean')
    return skbio.DistanceMatrix(distances, ids=series.index)
예제 #6
0
    def test_none_matched(self):
        # The only barcode supplied is TTTT; the demultiplexer is expected
        # to raise a ValueError whose message mentions 'demultiplexed'.
        barcode_series = pd.Series(
            ['TTTT'], index=['sample_d'], name='Barcode')
        barcodes_md = MetadataCategory(barcode_series)

        with redirected_stdio(stderr=os.devnull), \
                self.assertRaisesRegex(ValueError, 'demultiplexed'):
            self.demux_single_fn(self.muxed_sequences, barcodes_md)
예제 #7
0
    def test_heatmap_extra_tips(self):
        # Adds in test scenario where there are more tips than features
        # in the table
        np.random.seed(0)
        num_otus = 11  # otus
        # The builtin `str` is used instead of the `np.str` alias, which
        # NumPy deprecated in 1.20 and removed in 1.24.
        index = np.arange(5).astype(str)
        table = pd.DataFrame(np.random.random((len(index), num_otus)),
                             index=index,
                             columns=np.arange(num_otus).astype(str))

        # The tree gets twice as many tips as the table has columns.
        x = np.random.rand(num_otus*2)
        dm = DistanceMatrix.from_iterable(x, lambda a, b: np.abs(a-b))
        lm = ward(dm.condensed_form())
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))

        # Name every internal node and give every node a random length.
        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand()*3

        md = MetadataCategory(
            pd.Series(['a', 'a', 'a', 'b', 'b'], index=index))

        dendrogram_heatmap(self.results, table, t, md)

        index_fp = os.path.join(self.results, 'index.html')
        self.assertTrue(os.path.exists(index_fp))

        with open(index_fp, 'r') as fh:
            html = fh.read()
            self.assertIn('<h1>Dendrogram heatmap</h1>',
                          html)
예제 #8
0
    def test_visualization_garbage_metadata(self):
        # The metadata lists seven sample ids ('0'..'6') while the table only
        # has five samples, i.e. the metadata contains extra entries.
        np.random.seed(0)
        num_otus = 10  # otus
        num_samples = 5
        # The builtin `str` is used instead of the `np.str` alias, which
        # NumPy deprecated in 1.20 and removed in 1.24.
        table = pd.DataFrame(np.random.random((num_samples, num_otus)),
                             index=np.arange(num_samples).astype(str),
                             columns=np.arange(num_otus).astype(str))

        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda a, b: np.abs(a-b))
        lm = ward(dm.condensed_form())
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))

        # Name every internal node and give every node a random length.
        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand()*3

        md = MetadataCategory(
            pd.Series(['a', 'a', 'a', 'b', 'b', 'foo', 'foo'],
                      index=np.arange(7).astype(str)))

        dendrogram_heatmap(self.results, table, t, md)

        index_fp = os.path.join(self.results, 'index.html')
        self.assertTrue(os.path.exists(index_fp))

        with open(index_fp, 'r') as fh:
            html = fh.read()
            self.assertIn('<h1>Dendrogram heatmap</h1>',
                          html)
예제 #9
0
def regress_samples(output_dir: str, table: pd.DataFrame,
                    metadata: qiime2.MetadataCategory,
                    test_size: float=defaults['test_size'],
                    step: float=defaults['step'],
                    cv: int=defaults['cv'], random_state: int=None,
                    n_jobs: int=defaults['n_jobs'],
                    n_estimators: int=defaults['n_estimators'],
                    estimator: str='RandomForestRegressor',
                    optimize_feature_selection: bool=False,
                    stratify: str=False, parameter_tuning: bool=False) -> None:
    """Train a regressor on `table` against `metadata` and visualize it.

    The estimator name is resolved by `_set_parameters_and_estimator`, the
    model is fit and scored by `split_optimize_classify` (called with
    `scoring=mean_squared_error` and `classification=False`), and the
    outcome is handed to `_visualize`, which receives `output_dir`.
    """
    # The category name is the name of the underlying pandas Series.
    category_name = metadata.to_series().name

    # Feature selection is switched off for estimators that do not support
    # it; the helper also reports whether importances can be calculated.
    optimize_feature_selection, calc_feature_importance = \
        _disable_feature_selection(estimator, optimize_feature_selection)

    # Resolve the estimator and the parameter distributions used for tuning.
    # NOTE(review): classification=True is passed here although this is the
    # regression entry point -- confirm against the helper's semantics.
    estimator, param_dist, parameter_tuning = _set_parameters_and_estimator(
        estimator, table, metadata, category_name, n_estimators, n_jobs, cv,
        random_state, parameter_tuning, classification=True)

    fit_options = dict(
        test_size=test_size,
        step=step,
        cv=cv,
        random_state=random_state,
        n_jobs=n_jobs,
        optimize_feature_selection=optimize_feature_selection,
        parameter_tuning=parameter_tuning,
        param_dist=param_dist,
        calc_feature_importance=calc_feature_importance,
        scoring=mean_squared_error,
        stratify=stratify,
        classification=False)
    estimator, cm, accuracy, importances = split_optimize_classify(
        table, metadata, category_name, estimator, output_dir, **fit_options)

    _visualize(output_dir, estimator, cm, accuracy, importances,
               optimize_feature_selection)
예제 #10
0
    def test_variable_length_barcodes(self):
        # Barcodes of length 5, 6 and 4 are demultiplexed from a dedicated
        # fixture file instead of the shared self.muxed_sequences.
        barcode_series = pd.Series(
            ['AAAAA', 'CCCCCC', 'GGGG'],
            index=['sample_a', 'sample_b', 'sample_c'],
            name='Barcode')
        barcodes_md = MetadataCategory(barcode_series)

        fastq_fp = self.get_data_path('variable_length.fastq.gz')
        variable_length_seqs = Artifact.import_data(
            'MultiplexedSingleEndBarcodeInSequence', fastq_fp)

        with redirected_stdio(stderr=os.devnull):
            demuxed, untrimmed = self.demux_single_fn(
                variable_length_seqs, barcodes_md)

        # All three samples are expected in the demultiplexed output, with
        # nothing left untrimmed.
        self.assert_demux_results(barcodes_md.to_series(), demuxed)
        self.assert_untrimmed_results(b'', untrimmed)
예제 #11
0
파일: _pfdr.py 프로젝트: mortonjt/q2-pfdr
def permutation_fdr(table: pd.DataFrame,
                    metadata: qiime2.MetadataCategory,
                    statistical_test: str = 'meandiff',
                    transform_function: str = 'log',
                    alpha: float = 0.05,
                    permutations: int = 1000) -> pd.Series:
    """Run `_pfdr` on the transposed table and the aligned metadata values.

    The metadata series is indexed by ``table.index`` so that its values
    line up with the table rows; the result of `_pfdr` is returned as is.
    """
    # See q2-composition for more details
    # https://github.com/qiime2/q2-composition/blob/master/q2_composition/_ancom.py

    # TODO : Consider renaming the functions to match q2-composition

    # Select the metadata entries in table-row order so both inputs match up.
    aligned_values = metadata.to_series()[table.index].values
    transposed_counts = table.values.T
    return _pfdr(transposed_counts, aligned_values,
                 statistical_test, transform_function, alpha,
                 permutations)
예제 #12
0
    def test_error_tolerance_filtering(self):
        # sample_a's barcode is given as AAAG and the default error
        # tolerance is used.
        barcode_series = pd.Series(
            ['AAAG', 'CCCC'],
            index=['sample_a', 'sample_b'],
            name='Barcode')
        barcodes_md = MetadataCategory(barcode_series)

        with redirected_stdio(stderr=os.devnull):
            demuxed, untrimmed = self.demux_single_fn(
                self.muxed_sequences, barcodes_md)

        # sample_a is dropped because of a substitution error (AAAA vs AAAG),
        # so its reads (id1, id3) join the GGGG read (id6) as untrimmed.
        expected_demuxed = pd.Series(['CCCC'], index=['sample_b'])
        expected_untrimmed = (
            b'@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
            b'@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
            b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n')
        self.assert_demux_results(expected_demuxed, demuxed)
        self.assert_untrimmed_results(expected_untrimmed, untrimmed)
예제 #13
0
    def test_extra_barcode_in_metadata(self):
        # The metadata lists four barcodes, one of which (TTTT / sample_d)
        # is expected to have no matching reads.
        barcode_series = pd.Series(
            ['AAAA', 'CCCC', 'GGGG', 'TTTT'],
            index=['sample_a', 'sample_b', 'sample_c', 'sample_d'],
            name='Barcode')
        barcodes_md = MetadataCategory(barcode_series)

        with redirected_stdio(stderr=os.devnull):
            demuxed, untrimmed = self.demux_single_fn(
                self.muxed_sequences, barcodes_md)

        # TTTT/sample_d shouldn't be in the demuxed results, because there
        # were no reads with that barcode present
        expected_demuxed = pd.Series(
            ['AAAA', 'CCCC', 'GGGG'],
            index=['sample_a', 'sample_b', 'sample_c'])
        self.assert_demux_results(expected_demuxed, demuxed)
        # Every read matched a barcode, so nothing is left untrimmed.
        self.assert_untrimmed_results(b'', untrimmed)
예제 #14
0
def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
                 barcodes: qiime2.MetadataCategory,
                 error_tolerance: float=0.1) -> \
                    (CasavaOneEightSingleLanePerSampleDirFmt,
                     MultiplexedSingleEndBarcodeInSequenceDirFmt):
    """Demultiplex single-end reads whose barcodes are in the sequences.

    The barcodes are written to a temporary FASTA file, an external command
    built by `_build_demux_command` is run, and the per-sample files are
    renamed with `_rename_files`.

    Returns the per-sample directory format and the directory format
    holding the untrimmed reads.

    Raises ValueError when no per-sample sequence file was produced.
    """
    barcode_series = barcodes.to_series()
    demuxed_dir = CasavaOneEightSingleLanePerSampleDirFmt()
    untrimmed_dir = MultiplexedSingleEndBarcodeInSequenceDirFmt()

    # Start the untrimmed output off with an empty fastq file.
    _write_empty_fastq_to_mux_barcode_in_seq_fmt(untrimmed_dir)

    # The temporary FASTA only needs to exist while the command runs.
    with tempfile.NamedTemporaryFile() as fasta_fh:
        _write_barcode_fasta(barcode_series, fasta_fh)
        run_command(_build_demux_command(
            seqs, fasta_fh, demuxed_dir, untrimmed_dir, error_tolerance))

    _rename_files(demuxed_dir, barcode_series)

    n_demuxed = sum(
        1 for _ in demuxed_dir.sequences.iter_views(FastqGzFormat))
    if not n_demuxed:
        raise ValueError('No samples were demultiplexed.')

    return demuxed_dir, untrimmed_dir
def heatmap(output_dir,
            table: pd.DataFrame,
            metadata: qiime2.MetadataCategory = None,
            normalize: bool = True,
            title: str = None,
            metric: str = 'euclidean',
            method: str = 'average',
            cluster: str = 'both',
            color_scheme: str = 'rocket') -> None:
    """Draw a clustered heatmap of `table` and render it into `output_dir`.

    The plot is saved as 'feature-table-heatmap.png' and
    'feature-table-heatmap.svg', and the 'index.html' template is rendered
    with the `normalize` flag in its context.

    Raises ValueError if `table` is empty.
    """
    if table.empty:
        raise ValueError('Cannot visualize an empty table.')

    # Validation
    if metadata is not None:
        table = _munge_metadata(metadata.to_series(), table, cluster)

    if normalize:
        # log10(x + 1) keeps zero counts finite.
        table = table.apply(lambda column: np.log10(column + 1))
        colorbar_label = 'log10 frequency'
    else:
        colorbar_label = 'frequency'

    # Hard-coded values for reasonable plots
    cells_per_unit, tick_label_size, figure_dpi = 50, 8, 100
    sns.set(rc={
        'xtick.labelsize': tick_label_size,
        'ytick.labelsize': tick_label_size,
        'figure.dpi': figure_dpi,
    })
    n_rows, n_cols = table.shape[0], table.shape[1]
    width = n_cols / cells_per_unit
    height = n_rows / cells_per_unit

    grid = sns.clustermap(table,
                          method=method,
                          metric=metric,
                          **_clustering_map[cluster],
                          cmap=color_scheme,
                          xticklabels=True,
                          yticklabels=True,
                          cbar_kws={'label': colorbar_label})
    if title is not None:
        grid.fig.suptitle(title)

    # Capture the current layout before any axes are moved.
    heat_box = grid.ax_heatmap.get_position()
    cbar_box = grid.cax.get_position()
    row_box = grid.ax_row_dendrogram.get_position()
    col_box = grid.ax_col_dendrogram.get_position()

    # Resize the plot to set cell aspect-ratio to square
    top_of_heatmap = heat_box.y0 + height
    grid.ax_heatmap.set_position([heat_box.x0, heat_box.y0, width, height])
    grid.cax.set_position(
        [cbar_box.x0, top_of_heatmap, cbar_box.width, cbar_box.height])
    grid.ax_row_dendrogram.set_position(
        [row_box.x0, row_box.y0, row_box.width, height])
    grid.ax_col_dendrogram.set_position(
        [col_box.x0, top_of_heatmap, width, col_box.height])

    # https://stackoverflow.com/a/34697479/3776794
    plt.setp(grid.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    plt.setp(grid.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    for extension in ('png', 'svg'):
        grid.savefig(os.path.join(
            output_dir, 'feature-table-heatmap.%s' % extension))

    template_fp = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(template_fp, output_dir,
                       context={'normalize': normalize})
예제 #16
0
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str='permanova',
                            pairwise: bool=False,
                            permutations: int=999) -> None:
    """Test whether groups of samples differ in their distances and report.

    Writes one '<group>-boxplots.png' / '.pdf' pair per group into
    `output_dir` (the group id is URL-quoted), optionally a
    '<method>-pairwise.csv' file, and renders the
    'beta_group_significance_assets/index.html' template.

    Parameters
    ----------
    output_dir : str
        Directory that receives the plots, CSV and rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.MetadataCategory
        Category that assigns each sample to a group.
    method : str
        Key into `_beta_group_significance_fns`.
    pairwise : bool
        When true, every pair of groups is additionally tested and the
        p-values are corrected with the 'fdr_bh' method.
    permutations : int
        Forwarded to the significance function.

    Raises
    ------
    ValueError
        If `method` is not a known group significance method.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. Values that cannot be cast are kept unchanged; the
    # exception is caught explicitly because `errors='ignore'` is deprecated
    # in recent pandas releases.
    metadata = metadata.to_series()
    try:
        metadata = pd.to_numeric(metadata, errors='raise')
    except (ValueError, TypeError):
        pass
    # Keep only the samples that are in the distance matrix, then drop
    # samples which have no data for this metadata category, including
    # those with empty strings as values. `reindex` yields NaN for ids that
    # are absent from the metadata (which are then dropped), whereas
    # `.loc[list]` raises KeyError for them on pandas >= 1.0.
    metadata = metadata.reindex(list(distance_matrix.ids))
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style("white")
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    groupings = collections.OrderedDict(
        [(group_key, list(series.index))
         for group_key, series in sorted(metadata.groupby(metadata))])

    for group_id in groupings:
        group_distances, x_ticklabels = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        # Group ids are URL-quoted so they are safe to use in file names.
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        # The same figure is reused for the next group, so wipe it.
        fig.clear()

    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        # Benjamini-Hochberg corrected p-values across all group pairs.
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = pairwise_results.to_html(
            classes=("table table-striped table-hover"))
        pairwise_results_html = pairwise_results_html.replace(
            'border="1"', 'border="0"')
    else:
        pairwise_results_html = None

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
예제 #17
0
def beta_correlation(output_dir: str,
                     distance_matrix: skbio.DistanceMatrix,
                     metadata: qiime2.MetadataCategory,
                     method: str='spearman',
                     permutations: int=999) -> None:
    """Run a Mantel test between `distance_matrix` and numeric metadata.

    Distances derived from the metadata values (via `_metadata_distance`)
    are compared to `distance_matrix` with a two-sided Mantel test. A
    scatter plot is saved as 'beta-correlation-scatter.png' / '.pdf' in
    `output_dir` and the 'beta_correlation_assets/index.html' template is
    rendered.

    Parameters
    ----------
    output_dir : str
        Directory that receives the plots and the rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.MetadataCategory
        Numeric category to correlate with the distances.
    method : str
        'spearman' or 'pearson'.
    permutations : int
        Number of permutations forwarded to the Mantel test.

    Raises
    ------
    ValueError
        If the metadata is not numeric, or if a sample in the distance
        matrix is absent from the metadata or has no value.
    """
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'
    try:
        metadata = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        raise ValueError('Only numeric data can be used with the Mantel test. '
                         'Non-numeric data was encountered in the sample '
                         'metadata. Original error message follows:\n%s' %
                         str(e))

    initial_metadata_length = len(metadata)
    # `reindex` yields NaN for ids that are absent from the metadata, so the
    # check below can report them; `.loc[list]` would instead raise a bare
    # KeyError for missing labels on pandas >= 1.0.
    metadata = metadata.reindex(list(distance_matrix.ids))
    metadata = metadata.replace(r'', numpy.nan).dropna()
    filtered_metadata_length = len(metadata)

    ids_with_missing_metadata = set(distance_matrix.ids) - set(metadata.index)
    if len(ids_with_missing_metadata) > 0:
        # Sorted so the error message is deterministic.
        raise ValueError('All samples in distance matrix must be present '
                         'and contain data in the sample metadata. The '
                         'following samples were present in the distance '
                         'matrix, but were missing from the sample metadata '
                         'or had no data: %s' %
                         ', '.join(sorted(ids_with_missing_metadata)))

    metadata_distances = _metadata_distance(metadata)
    r, p, n = skbio.stats.distance.mantel(
        distance_matrix, metadata_distances, method=method,
        permutations=permutations, alternative=alt_hypothesis, strict=True)

    result = pd.Series([method.title(), n, permutations, alt_hypothesis,
                        metadata.name, r, p],
                       index=['Method', 'Sample size', 'Permutations',
                              'Alternative hypothesis', 'Metadata category',
                              '%s %s' % (method.title(),
                                         test_statistics[method]),
                              'p-value'],
                       name='Mantel test results')
    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    # One point per unordered pair of samples: input distance vs. metadata
    # distance.
    scatter_data = []
    for id1, id2 in itertools.combinations(distance_matrix.ids, 2):
        scatter_data.append((distance_matrix[id1, id2],
                             metadata_distances[id1, id2]))
    x = 'Input distance'
    y = 'Euclidean distance of\n%s' % metadata.name
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    fig = sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False).get_figure()
    fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.png'))
    fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.pdf'))

    index = os.path.join(
        TEMPLATES, 'beta_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_metadata_length': initial_metadata_length,
        'filtered_metadata_length': filtered_metadata_length,
        'result': result_html
    })
예제 #18
0
def beta_correlation(output_dir: str,
                     distance_matrix: skbio.DistanceMatrix,
                     metadata: qiime2.MetadataCategory,
                     method: str = 'spearman',
                     permutations: int = 999) -> None:
    """Run a Mantel test between `distance_matrix` and numeric metadata.

    Distances derived from the metadata values (via `_metadata_distance`)
    are compared to `distance_matrix` with a two-sided Mantel test. A
    scatter plot is saved as 'beta-correlation-scatter.png' / '.pdf' in
    `output_dir` and the 'beta_correlation_assets/index.html' template is
    rendered.

    Parameters
    ----------
    output_dir : str
        Directory that receives the plots and the rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.MetadataCategory
        Numeric category to correlate with the distances.
    method : str
        'spearman' or 'pearson'.
    permutations : int
        Number of permutations forwarded to the Mantel test.

    Raises
    ------
    ValueError
        If the metadata is not numeric, or if a sample in the distance
        matrix is absent from the metadata or has no value.
    """
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'
    try:
        metadata = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        raise ValueError('Only numeric data can be used with the Mantel test. '
                         'Non-numeric data was encountered in the sample '
                         'metadata. Original error message follows:\n%s' %
                         str(e))

    initial_metadata_length = len(metadata)
    # `reindex` yields NaN for ids that are absent from the metadata, so the
    # check below can report them; `.loc[list]` would instead raise a bare
    # KeyError for missing labels on pandas >= 1.0.
    metadata = metadata.reindex(list(distance_matrix.ids))
    metadata = metadata.replace(r'', numpy.nan).dropna()
    filtered_metadata_length = len(metadata)

    ids_with_missing_metadata = set(distance_matrix.ids) - set(metadata.index)
    if len(ids_with_missing_metadata) > 0:
        # Sorted so the error message is deterministic.
        raise ValueError('All samples in distance matrix must be present '
                         'and contain data in the sample metadata. The '
                         'following samples were present in the distance '
                         'matrix, but were missing from the sample metadata '
                         'or had no data: %s' %
                         ', '.join(sorted(ids_with_missing_metadata)))

    metadata_distances = _metadata_distance(metadata)
    r, p, n = skbio.stats.distance.mantel(distance_matrix,
                                          metadata_distances,
                                          method=method,
                                          permutations=permutations,
                                          alternative=alt_hypothesis,
                                          strict=True)

    result = pd.Series(
        [method.title(), n, permutations, alt_hypothesis, metadata.name, r, p],
        index=[
            'Method', 'Sample size', 'Permutations', 'Alternative hypothesis',
            'Metadata category',
            '%s %s' % (method.title(), test_statistics[method]), 'p-value'
        ],
        name='Mantel test results')
    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    # One point per unordered pair of samples: input distance vs. metadata
    # distance.
    scatter_data = []
    for id1, id2 in itertools.combinations(distance_matrix.ids, 2):
        scatter_data.append(
            (distance_matrix[id1, id2], metadata_distances[id1, id2]))
    x = 'Input distance'
    y = 'Euclidean distance of\n%s' % metadata.name
    # Start from a fresh figure so the scatter plot is drawn on new axes.
    plt.figure()
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
    plt.savefig(os.path.join(output_dir, 'beta-correlation-scatter.png'))
    plt.savefig(os.path.join(output_dir, 'beta-correlation-scatter.pdf'))

    index = os.path.join(TEMPLATES, 'beta_correlation_assets', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'initial_metadata_length': initial_metadata_length,
                           'filtered_metadata_length':
                           filtered_metadata_length,
                           'result': result_html
                       })
예제 #19
0
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str='permanova',
                            permutations: int=999) -> None:
    """Test whether groups of samples differ in their distances and report.

    Writes one '<group>-boxplots.png' / '.pdf' pair per group into
    `output_dir` (the group id is URL-quoted) and renders the
    'beta_group_significance_assets/index.html' template.

    Parameters
    ----------
    output_dir : str
        Directory that receives the plots and the rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.MetadataCategory
        Category that assigns each sample to a group.
    method : str
        Key into `_beta_group_significance_fns`.
    permutations : int
        Forwarded to the significance function.

    Raises
    ------
    ValueError
        If `method` is not a known group significance method.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. Values that cannot be cast are kept unchanged; the
    # exception is caught explicitly because `errors='ignore'` is deprecated
    # in recent pandas releases.
    metadata = metadata.to_series()
    try:
        metadata = pd.to_numeric(metadata, errors='raise')
    except (ValueError, TypeError):
        pass
    # Keep only the samples that are in the distance matrix, then drop
    # samples which have no data for this metadata category, including
    # those with empty strings as values. `reindex` yields NaN for ids that
    # are absent from the metadata (which are then dropped), whereas
    # `.loc[list]` raises KeyError for them on pandas >= 1.0.
    metadata = metadata.reindex(list(distance_matrix.ids))
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style("white")
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    groupings = collections.OrderedDict(
        [(group_key, list(series.index))
         for group_key, series in sorted(metadata.groupby(metadata))])

    for group_id in groupings:
        group_distances, x_ticklabels = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        # Group ids are URL-quoted so they are safe to use in file names.
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        # The same figure is reused for the next group, so wipe it.
        fig.clear()

    result = result.to_frame().to_html(classes="table table-striped "
                                       "table-hover")
    result = result.replace('border="1"', 'border="0"')
    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result
    })
예제 #20
0
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.MetadataCategory,
          statistical_test: str = 'f_oneway',
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    """Run ANCOM on `table` and write an HTML report into `output_dir`.

    Writes 'ancom.csv' (and 'percent-abundances.csv' when the ANCOM call
    returns a second result) plus an 'index.html' that lists the features
    for which the null hypothesis was rejected, followed by the markup
    returned by `_volcanoplot`.

    Parameters
    ----------
    output_dir : str
        Directory that receives the CSV files and 'index.html'.
    table : pd.DataFrame
        Feature table whose index holds the sample ids.
    metadata : qiime2.MetadataCategory
        Category providing the grouping for each sample.
    statistical_test : str
        Name of the test; must be one of `statistical_tests()`.
    transform_function, difference_function : str
        Forwarded to `_volcanoplot`.

    Raises
    ------
    ValueError
        If `statistical_test` is unknown, or if any sample in `table` has
        no metadata value.
    """
    index_fp = os.path.join(output_dir, 'index.html')

    if statistical_test not in statistical_tests():
        raise ValueError("Unknown statistical test: %s" % statistical_test)

    metadata_series = metadata.to_series()
    # `reindex` yields NaN for table samples that are absent from the
    # metadata so that they are reported by the check below; `.loc` would
    # raise a bare KeyError for them on pandas >= 1.0.
    metadata_series = metadata_series.reindex(table.index)
    if pd.isnull(metadata_series).any():
        missing_data_sids = metadata_series[pd.isnull(metadata_series)].index
        missing_data_sids = ', '.join(missing_data_sids)
        raise ValueError('Metadata category is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table. %s' % missing_data_sids)

    statistical_test = _sig_tests[statistical_test]
    ancom_results = skbio_ancom(table,
                                metadata_series,
                                significance_test=statistical_test)
    # scikit-bio 0.4.2 returns a single tuple from ancom, and scikit-bio 0.5.0
    # returns two tuples. We want to support both scikit-bio versions, so we
    # tuplize ancom_result to support both. Similarly, the "reject" column
    # was renamed in scikit-bio 0.5.0, so we apply a rename here (which does
    # nothing if a column called "reject" isn't found).
    ancom_results = qiime2.core.util.tuplize(ancom_results)
    stats = ancom_results[0]
    # sort_values returns a new frame unless inplace=True is given; sorting
    # in place makes the ordering by W actually reach the CSV and the HTML
    # table.
    stats.sort_values(by='W', ascending=False, inplace=True)
    stats.rename(columns={'reject': 'Reject null hypothesis'},
                 inplace=True)

    stats.to_csv(os.path.join(output_dir, 'ancom.csv'),
                 header=True,
                 index=True)

    html = _volcanoplot(output_dir, table, metadata, stats,
                        transform_function, difference_function)

    # Rows whose 'Reject null hypothesis' flag is set.
    significant_features = stats[stats['Reject null hypothesis']]

    with open(index_fp, 'w') as index_f:
        index_f.write('<html><body>\n')
        index_f.write('<h1>ANCOM statistical results</h1>\n')
        index_f.write('<a href="ancom.csv">Download as CSV</a><br>\n')
        index_f.write(
            q2templates.df_to_html(significant_features['W'].to_frame(),
                                   border=None,
                                   classes=None))
        # A second result is only present with newer scikit-bio versions.
        if len(ancom_results) == 2:
            ancom_results[1].to_csv(os.path.join(output_dir,
                                                 'percent-abundances.csv'),
                                    header=True,
                                    index=True)
            index_f.write(('<h1>Percentile abundances of features '
                           'by group</h1>\n'))
            index_f.write(('<a href="percent-abundances.csv">'
                           'Download as CSV</a><br>\n'))
            index_f.write(
                q2templates.df_to_html(
                    ancom_results[1].loc[significant_features.index],
                    border=None,
                    classes=None))
        index_f.write(html)
        index_f.write('</body></html>\n')