Exemplo n.º 1
0
def sample_longitudinal(dates: qiime2.CategoricalMetadataColumn,
                        context_seqs: DNAFASTAFormat = None,
                        start_date: str = None,
                        samples_per_interval: int = 7,
                        days_per_interval: int = 7,
                        seed: int = None) -> IDSelection:

    window_size = '%dD' % days_per_interval

    if context_seqs is not None:
        # filter dates to only include the ids that sequence data is
        # available for
        ids_to_include = ids_from_fasta(str(context_seqs))
        dates = dates.filter_ids(ids_to_include)

    dt_series = pd.to_datetime(dates.to_series(), errors='coerce')
    df = pd.DataFrame({'ids': dates.to_series().index}, index=dt_series)

    if start_date is not None:
        filter_before = pd.Timestamp(start_date)
        df = df.iloc[np.where(dt_series >= filter_before)]
        if filter_before not in df.index:
            # this will be stripped in _sample_group::_sampler
            # the purpose is to force Pandas to begin the window at this
            # time instead of the first observation (by making NaN the first
            # observation)
            df.loc[filter_before] = float('nan')

    grouped = df.groupby(pd.Grouper(freq=window_size,
                                    convention='start',
                                    closed='left'),
                         group_keys=False)
    filtered_df = grouped.apply(_sample_group(samples_per_interval, seed))

    df = df.dropna(axis=0)
    selection = pd.Series(False, index=dates.to_series().index)
    selection[filtered_df['ids']] = True

    md = qiime2.Metadata(dates.to_dataframe())
    return IDSelection(selection, md, 'sample_longitudinal')
Exemplo n.º 2
0
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix,
                                        metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict([
        (id, list(series.index))
        for id, series in natsorted(metadata.groupby(metadata))
    ])

    pairs_summary = pd.DataFrame(
        columns=['SubjectID1', 'SubjectID2', 'Group1', 'Group2', 'Distance'])
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        group_pairs_summary = pd.DataFrame(group_pairs_summary,
                                           columns=[
                                               'SubjectID1', 'SubjectID2',
                                               'Group1', 'Group2', 'Distance'
                                           ])

        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])

        ax = sns.boxplot(data=group_distances,
                         flierprops={
                             'marker': 'o',
                             'markeredgecolor': 'black',
                             'markeredgewidth': 0.5,
                             'alpha': 0.5
                         })
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(
            os.path.join(output_dir, '%s-boxplots.png' %
                         urllib.parse.quote(str(group_id))))
        fig.savefig(
            os.path.join(output_dir, '%s-boxplots.pdf' %
                         urllib.parse.quote(str(group_id))))
        fig.clear()

    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([
                group1_id, group2_id, pairwise_result['sample size'],
                permutations, pairwise_result['test statistic'],
                pairwise_result['p-value']
            ])
        columns = [
            'Group 1', 'Group 2', 'Sample size', 'Permutations',
            result['test statistic name'], 'p-value'
        ]
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = [
        # We have to DOUBLE encode this, as the file/resource name is a literal
        # URI-encoded string, we do this to prevent issues with the filesystem
        # however, as a result, our links need to escape % so that the browser
        # asks for the right escaped name (instead of the original name, which
        # doesn't exist inside the visualization).
        urllib.parse.quote(urllib.parse.quote(k)) for k in groupings.keys()
    ]
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [
        group_ids[g:g + row_count] for g in range(0, group_count, row_count)
    ]

    index = os.path.join(TEMPLATES, 'beta_group_significance_assets',
                         'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'initial_dm_length': initial_dm_length,
                           'filtered_dm_length': filtered_dm_length,
                           'method': method,
                           'group_rows': group_rows,
                           'bootstrap_group_col_size': int(12 / row_count),
                           'result': result_html,
                           'pairwise_results': pairwise_results_html
                       })
Exemplo n.º 3
0
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)
    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)
    significant_features = ancom_results[0][ancom_results[0]
                                            ['Reject null hypothesis']]

    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            ancom_results[1].loc[significant_features.index])

    metadata = metadata.to_series()
    cats = list(set(metadata))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(transform_function,
                                    axis=1,
                                    result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # len(categories) > 2
            difference_function = 'f_statistic'

    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        args = _d_func(*[x[metadata == c] for c in cats])
        if isinstance(args, tuple):
            return args[0]
        else:
            return args

    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        pre_filtered_ids = set(fold_change.index)
        with pd.option_context('mode.use_inf_as_na', True):
            fold_change = fold_change.dropna(axis=0)
        filtered_ids = pre_filtered_ids - set(fold_change.index)
        filtered_ancom_results = ancom_results[0].drop(labels=filtered_ids)

        volcano_results = pd.DataFrame({
            transform_function_name: fold_change,
            'W': filtered_ancom_results.W
        })
        volcano_results.index.name = 'id'
        volcano_results.to_csv(os.path.join(output_dir, 'data.tsv'),
                               header=True,
                               index=True,
                               sep='\t')
        volcano_results = volcano_results.reset_index(drop=False)

        spec = {
            '$schema':
            'https://vega.github.io/schema/vega/v4.json',
            'width':
            300,
            'height':
            300,
            'data': [{
                'name': 'values',
                'values': volcano_results.to_dict(orient='records')
            }],
            'scales': [{
                'name': 'xScale',
                'domain': {
                    'data': 'values',
                    'field': transform_function_name
                },
                'range': 'width'
            }, {
                'name': 'yScale',
                'domain': {
                    'data': 'values',
                    'field': 'W'
                },
                'range': 'height'
            }],
            'axes': [{
                'scale': 'xScale',
                'orient': 'bottom',
                'title': transform_function_name
            }, {
                'scale': 'yScale',
                'orient': 'left',
                'title': 'W'
            }],
            'marks': [{
                'type': 'symbol',
                'from': {
                    'data': 'values'
                },
                'encode': {
                    'hover': {
                        'fill': {
                            'value': '#FF0000'
                        },
                        'opacity': {
                            'value': 1
                        }
                    },
                    'enter': {
                        'x': {
                            'scale': 'xScale',
                            'field': transform_function_name
                        },
                        'y': {
                            'scale': 'yScale',
                            'field': 'W'
                        }
                    },
                    'update': {
                        'fill': {
                            'value': 'black'
                        },
                        'opacity': {
                            'value': 0.3
                        },
                        'tooltip': {
                            'signal':
                            "{{'title': datum['id'], '{0}': "
                            "datum['{0}'], 'W': datum['W']}}".format(
                                transform_function_name)
                        }
                    }
                }
            }]
        }
        context['vega_spec'] = json.dumps(spec)
        if filtered_ids:
            context['filtered_ids'] = ', '.join(sorted(filtered_ids))

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.tsv'),
                            header=True,
                            index=True,
                            sep='\t')
    ancom_results[1].to_csv(os.path.join(output_dir, 'percent-abundances.tsv'),
                            header=True,
                            index=True,
                            sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)
Exemplo n.º 4
0
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:

    index_fp = os.path.join(output_dir, 'index.html')

    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)

    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    # scikit-bio 0.4.2 returns a single tuple from ancom, and scikit-bio 0.5.0
    # returns two tuples. We want to support both scikit-bio versions, so we
    # tuplize ancom_result to support both. Similarly, the "reject" column
    # was renamed in scikit-bio 0.5.0, so we apply a rename here (which does
    # nothing if a column called "reject" isn't found).
    ancom_results = qiime2.core.util.tuplize(ancom_results)
    ancom_results[0].sort_values(by='W', ascending=False)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)

    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.csv'),
                            header=True,
                            index=True)

    html = _volcanoplot(output_dir, table, metadata, ancom_results[0],
                        transform_function, difference_function)

    significant_features = ancom_results[0][ancom_results[0]
                                            ['Reject null hypothesis']]
    significant_features_present = not significant_features.empty
    insignificant_div = ('<div>No significant features identified!</div>')

    with open(index_fp, 'w') as index_f:
        index_f.write('<html>\n')
        if html is not None:
            index_f.write('<head>\n')
            index_f.write(INLINE.render())
            index_f.write('</head>\n')
        index_f.write('<body>\n')
        index_f.write('<h1>ANCOM statistical results</h1>\n')
        index_f.write('<a href="ancom.csv">Download complete table as CSV</a>'
                      '<br>\n')
        if significant_features_present:
            index_f.write(
                q2templates.df_to_html(significant_features['W'].to_frame(),
                                       border=None,
                                       classes=None))
        else:
            index_f.write(insignificant_div)
        if len(ancom_results) == 2:
            ancom_results[1].to_csv(os.path.join(output_dir,
                                                 'percent-abundances.csv'),
                                    header=True,
                                    index=True)
            index_f.write(('<h1>Percentile abundances of features '
                           'by group</h1>\n'))
            index_f.write(('<a href="percent-abundances.csv">'
                           'Download complete table as CSV</a><br>\n'))
            if significant_features_present:
                index_f.write(
                    q2templates.df_to_html(
                        ancom_results[1].loc[significant_features.index],
                        border=None,
                        classes=None))
            else:
                index_f.write(insignificant_div)
        if html is not None:
            index_f.write(html[1])
            index_f.write(html[0])
        else:
            index_f.write('<p>Unable to generate volcano plot, please check '
                          'the ANCOM statistical results (above).</p>\n')
        index_f.write('</body></html>\n')
Exemplo n.º 5
0
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in natsorted(metadata.groupby(metadata))])

    pairs_summary = pd.DataFrame(columns=['SubjectID1', 'SubjectID2', 'Group1',
                                          'Group2', 'Distance'])
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        group_pairs_summary = pd.DataFrame(
            group_pairs_summary, columns=['SubjectID1', 'SubjectID2',
                                          'Group1', 'Group2', 'Distance'])

        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = list(groupings.keys())
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [group_ids[g:g+row_count] for g in range(0, group_count,
                                                          row_count)]

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
Exemplo n.º 6
0
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)
    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)
    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]

    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            ancom_results[1].loc[significant_features.index])

    metadata = metadata.to_series()
    cats = list(set(metadata))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(
        transform_function, axis=1, result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # len(categories) > 2
            difference_function = 'f_statistic'

    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        args = _d_func(*[x[metadata == c] for c in cats])
        if isinstance(args, tuple):
            return args[0]
        else:
            return args
    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        volcano_results = pd.DataFrame({transform_function_name: fold_change,
                                        'W': ancom_results[0].W})
        volcano_results = volcano_results.reset_index(drop=False)

        spec = {
            '$schema': 'https://vega.github.io/schema/vega/v4.json',
            'width': 300,
            'height': 300,
            'data': [
                {'name': 'values',
                 'values': volcano_results.to_dict(orient='records')}],
            'scales': [
                {'name': 'xScale',
                 'domain': {'data': 'values',
                            'field': transform_function_name},
                 'range': 'width'},
                {'name': 'yScale',
                 'domain': {'data': 'values', 'field': 'W'},
                 'range': 'height'}],
            'axes': [
                {'scale': 'xScale', 'orient': 'bottom',
                 'title': transform_function_name},
                {'scale': 'yScale', 'orient': 'left', 'title': 'W'}],
            'marks': [
              {'type': 'symbol',
               'from': {'data': 'values'},
               'encode': {
                   'hover': {
                       'fill': {'value': '#FF0000'},
                       'opacity': {'value': 1}},
                   'enter': {
                       'x': {'scale': 'xScale',
                             'field': transform_function_name},
                       'y': {'scale': 'yScale', 'field': 'W'}},
                   'update': {
                       'fill': {'value': 'black'},
                       'opacity': {'value': 0.3},
                       'tooltip': {
                           'signal': "{{'title': datum['index'], '{0}': "
                                     "datum['{0}'], 'W': datum['W']}}".format(
                                         transform_function_name)}}}}]}
        context['vega_spec'] = json.dumps(spec)

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.tsv'),
                            header=True, index=True, sep='\t')
    ancom_results[1].to_csv(os.path.join(output_dir,
                                         'percent-abundances.tsv'),
                            header=True, index=True, sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)