Example #1
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if more filtering is supported in the future.
    df = metadata.to_dataframe()
    df = df.dropna()
    metadata = qiime2.Metadata(df)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    df = metadata.to_dataframe()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})
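A note on the pattern above: the pre-filter/post-filter set difference is how these visualizers report which metadata columns were dropped. A minimal standalone sketch, assuming a QIIME 2 environment (the toy IDs and values are hypothetical):

import pandas as pd
import qiime2

df = pd.DataFrame({'ph': [6.8, 7.1, 7.0], 'site': ['A', 'B', 'A']},
                  index=pd.Index(['s1', 's2', 's3'], name='id'))
md = qiime2.Metadata(df)

pre_filtered_cols = set(md.columns)
md = md.filter_columns(column_type='numeric')
non_numeric_cols = pre_filtered_cols - set(md.columns)  # {'site'}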
Example #2
def plot(output_dir,
         table: biom.Table,
         metadata: q2.Metadata,
         case_where: str,
         control_where: str,
         feature_tree: skbio.TreeNode = None):

    # Dump the tree for debugging; guard it because feature_tree may be None
    # (an empty TreeNode is only substituted further below).
    if feature_tree is not None:
        with open('/tmp/tree.nwk', 'w') as fh:
            feature_tree.write(fh)

    copy_tree(os.path.join(PLOT, 'assets', 'dist'), output_dir)
    data_dir = os.path.join(output_dir, 'data')
    os.mkdir(data_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))

    if feature_tree is not None:
        feature_tree = shear_no_prune(feature_tree, features)
    else:
        feature_tree = TreeNode()

    tree_data = tree_to_array(feature_tree)
    idx, = np.where(np.asarray(tree_data['children']) == 0)
    tree_data['lookup'] = dict(zip(map(str, idx), range(len(idx))))

    tip_order = np.asarray(tree_data['names'])[idx]
    table = table.sort_order(tip_order, axis='observation')
    table = table.sort_order(case_samples + control_samples, axis='sample')

    with open(os.path.join(data_dir, 'packed_table.jsonp'), 'w') as fh:
        fh.write('LOAD_PACKED_TABLE(')
        fh.write(json.dumps(table_to_b64pa(table)))
        fh.write(');')

    with open(os.path.join(data_dir, 'tree.jsonp'), 'w') as fh:
        fh.write('LOAD_TREE(')
        fh.write(json.dumps(tree_data))
        fh.write(');')
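Both output files above follow the same JSONP convention: the JSON payload is wrapped in a JavaScript callback so the visualization can load it with a plain <script> tag. A minimal sketch; the callback name and payload are illustrative:

import json

payload = {'names': ['f1', 'f2'], 'values': [3, 5]}
with open('data.jsonp', 'w') as fh:
    fh.write('LOAD_DATA(')
    fh.write(json.dumps(payload))
    fh.write(');')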
Example #3
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if this type of filtering is supported in the
    # future.
    df = metadata.to_dataframe()
    df = df.dropna(axis='index', how='any')

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})
Example #4
def summarize(output_dir: str,
              table: biom.Table,
              sample_metadata: qiime2.Metadata = None) -> None:
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(table,
                                                            axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            bins = 5
        else:
            # Freedman–Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples**(1 / 3))

            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies,
                                             kde=False,
                                             rug=True,
                                             bins=int(round(bins)))
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies,
                                              kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)
    overview_template = os.path.join(TEMPLATES, 'summarize_assets',
                                     'overview.html')
    sample_frequency_template = os.path.join(TEMPLATES, 'summarize_assets',
                                             'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(TEMPLATES, 'summarize_assets',
                                              'feature-frequency-detail.html')

    context.update({'max_count': sample_frequencies.max(),
                    'feature_frequencies_table': feature_frequencies_table,
                    'feature_qualitative_data': feature_qualitative_data,
                    'tabs': [{'url': 'overview.html',
                              'title': 'Overview'},
                             {'url': 'sample-frequency-detail.html',
                              'title': 'Interactive Sample Detail'},
                             {'url': 'feature-frequency-detail.html',
                              'title': 'Feature Detail'}]})
    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
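The sample histogram above sizes its bins with the Freedman-Diaconis rule (bin width = 2 * IQR / n^(1/3)), floored at 5 bins. A worked sketch with made-up summary numbers:

n = 50                                        # number of samples
iqr = 1200.0 - 300.0                          # 3rd quartile - 1st quartile
bin_width = (2 * iqr) / (n ** (1 / 3))        # Freedman-Diaconis width
bins = max((5000.0 - 100.0) / bin_width, 5)   # (max - min) / width, floor 5
print(int(round(bins)))                       # 10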
Example #5
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_all_unique=True,
                                       drop_zero_variance=True,
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat(
            [alpha_diversity, metadata_column.to_series()],
            axis=1,
            join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(
                        groups[i], groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append([
                        '%s:%s' % (column, names[i]),
                        '%s:%s' % (column, names[j])
                    ])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(kw_H_pairwise['p-value'],
                                                 method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump(
                {
                    'initial': initial_data_length,
                    'filtered': filtered_data_length
                }, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(TEMPLATES, 'alpha_group_significance_assets',
                         'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
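The pairwise testing above compares every group with every earlier group, then corrects the p-values with Benjamini-Hochberg. A minimal standalone sketch with made-up groups (multipletests is assumed to come from statsmodels, consistent with the 'fdr_bh' method name):

import pandas as pd
import scipy.stats
from statsmodels.stats.multitest import multipletests

groups = {'a': [1.0, 2.0, 3.0], 'b': [2.5, 3.5, 4.5], 'c': [0.5, 1.5, 2.0]}
names = list(groups)
rows = []
for i in range(len(names)):
    for j in range(i):
        H, p = scipy.stats.mstats.kruskalwallis(groups[names[i]],
                                                groups[names[j]])
        rows.append([names[j], names[i], H, p])
result = pd.DataFrame(rows, columns=['Group 1', 'Group 2', 'H', 'p-value'])
result['q-value'] = multipletests(result['p-value'], method='fdr_bh')[1]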
Example #6
def alpha_rarefaction(output_dir: str,
                      table: biom.Table,
                      max_depth: int,
                      phylogeny: skbio.TreeNode = None,
                      metrics: set = None,
                      metadata: qiime2.Metadata = None,
                      min_depth: int = 1,
                      steps: int = 10,
                      iterations: int = 10) -> None:

    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        metadata_df.columns = pd.MultiIndex.from_tuples([
            (c, '') for c in metadata_df.columns
        ])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth, steps,
                                     iterations, phylogeny, metrics)

    filenames = []
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(
                    column, columns, merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = [
                'depth-%d_iter-%d' % (t[0], t[1]) for t in data.columns.values
            ]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'metrics': list(metrics),
                           'filenames': [quote(f) for f in filenames],
                           'columns': list(columns),
                           'steps': steps,
                           'filtered_columns': sorted(filtered_columns)
                       })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
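Metric and column names become file names above, so they are percent-encoded with urllib.parse.quote. Note that quote() leaves '/' unescaped by default, which is why the group-significance examples add an extra replace. A small sketch:

from urllib.parse import quote

column = 'body site/env'
escaped = quote(column).replace('/', '%2F')
filename = 'column-%s.jsonp' % escaped   # column-body%20site%2Fenv.jsonp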
Example #7
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_column.to_series(), alpha_diversity],
                       axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {
                'initial': alpha_diversity.shape[0],
                'method': method.title(),
                'filtered': df.shape[0]
            }

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump(
                {
                    'method': method.title(),
                    'testStat': '%1.4f' % correlation_result[0],
                    'pVal': '%1.4f' % correlation_result[1],
                    'sampleSize': df.shape[0]
                }, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_correlation_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
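_alpha_correlation_fns is not shown in this listing; a plausible definition, assuming the usual scipy correlation functions keyed by method name:

import scipy.stats

_alpha_correlation_fns = {'spearman': scipy.stats.spearmanr,
                          'pearson': scipy.stats.pearsonr}

rho, p = _alpha_correlation_fns['spearman']([1, 2, 3, 4], [2, 1, 4, 3])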
Example #8
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata=None) -> None:
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            bins = 5
        else:
            # Freedman–Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples ** (1/3))

            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True, bins=int(round(bins)))
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)
    overview_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'overview.html')
    sample_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'feature-frequency-detail.html')

    context.update({'max_count': sample_frequencies.max(),
                    'feature_frequencies_table': feature_frequencies_table,
                    'feature_qualitative_data': feature_qualitative_data,
                    'tabs': [{'url': 'overview.html',
                              'title': 'Overview'},
                             {'url': 'sample-frequency-detail.html',
                              'title': 'Interactive Sample Detail'},
                             {'url': 'feature-frequency-detail.html',
                              'title': 'Feature Detail'}]})
    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
Example #9
def beta_rarefaction(output_dir: str,
                     table: biom.Table,
                     metric: str,
                     clustering_method: str,
                     metadata: qiime2.Metadata,
                     sampling_depth: int,
                     iterations: int = 10,
                     phylogeny: skbio.TreeNode = None,
                     correlation_method: str = 'spearman',
                     color_scheme: str = 'BrBG') -> None:
    with qiime2.sdk.Context() as scope:
        if table.is_empty():
            raise ValueError("Input feature table is empty.")

        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        table = qiime2.Artifact.import_data('FeatureTable[Frequency]', table)

        if metric in phylogenetic_metrics():
            if phylogeny is None:
                raise ValueError("A phylogenetic metric (%s) was requested, "
                                 "but a phylogenetic tree was not provided. "
                                 "Phylogeny must be provided when using a "
                                 "phylogenetic diversity metric." % metric)

            phylogeny = qiime2.Artifact.import_data('Phylogeny[Rooted]',
                                                    phylogeny)
            api_method = scope.ctx.get_action('diversity', 'beta_phylogenetic')
            beta_func = functools.partial(api_method, phylogeny=phylogeny)
        else:
            beta_func = scope.ctx.get_action('diversity', 'beta')

        rare_func = scope.ctx.get_action('feature-table', 'rarefy')

        distance_matrices = _get_multiple_rarefaction(beta_func, rare_func,
                                                      metric, iterations,
                                                      table, sampling_depth)

    primary = distance_matrices[0]
    support = distance_matrices[1:]

    heatmap_fig, similarity_df = _make_heatmap(distance_matrices, metric,
                                               correlation_method,
                                               color_scheme)
    heatmap_fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    similarity_df.to_csv(os.path.join(output_dir,
                                      'rarefaction-iteration-correlation.tsv'),
                         sep='\t')

    tree = _cluster_samples(primary, support, clustering_method)
    tree.write(
        os.path.join(output_dir,
                     'sample-clustering-%s.tre' % clustering_method))

    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    templates = list(map(
        lambda page: os.path.join(TEMPLATES, 'beta_rarefaction_assets', page),
        ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']))

    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': [{'url': 'emperor.html',
                  'title': 'PCoA'},
                 {'url': 'heatmap.html',
                  'title': 'Heatmap'},
                 {'url': 'tree.html',
                  'title': 'Clustering'}]
    }

    q2templates.render(templates, output_dir, context=context)
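The functools.partial call above fixes the phylogeny argument once, so both branches hand downstream code a beta_func with the same call signature. A toy illustration (beta_like stands in for the real action):

import functools

def beta_like(table, metric, phylogeny=None):
    return table, metric, phylogeny

beta_func = functools.partial(beta_like, phylogeny='rooted-tree')
print(beta_func('tbl', 'weighted_unifrac'))
# ('tbl', 'weighted_unifrac', 'rooted-tree')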
Example #10
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
Example #11
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:

    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)

    filenames = []
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column,
                                                              columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
Example #12
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_column.to_series(), alpha_diversity], axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
Example #13
def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int=10,
                     phylogeny: skbio.TreeNode=None,
                     correlation_method: str='spearman',
                     color_scheme: str='BrBG') -> None:
    if metric in phylogenetic_metrics():
        if phylogeny is None:
            raise ValueError("A phylogenetic metric (%s) was requested, "
                             "but a phylogenetic tree was not provided. "
                             "Phylogeny must be provided when using a "
                             "phylogenetic diversity metric." % metric)
        beta_func = functools.partial(beta_phylogenetic, phylogeny=phylogeny)
    else:
        beta_func = beta

    if table.is_empty():
        raise ValueError("Input feature table is empty.")

    # Filter metadata to only include sample IDs present in the feature table.
    # Also ensures every feature table sample ID is present in the metadata.
    metadata = metadata.filter_ids(table.ids(axis='sample'))

    distance_matrices = _get_multiple_rarefaction(
        beta_func, metric, iterations, table, sampling_depth)
    primary = distance_matrices[0]
    support = distance_matrices[1:]

    heatmap_fig, similarity_df = _make_heatmap(
        distance_matrices, metric, correlation_method, color_scheme)
    heatmap_fig.savefig(os.path.join(output_dir, 'heatmap.svg'))
    similarity_df.to_csv(
        os.path.join(output_dir, 'rarefaction-iteration-correlation.tsv'),
        sep='\t')

    tree = _cluster_samples(primary, support, clustering_method)
    tree.write(os.path.join(output_dir,
                            'sample-clustering-%s.tre' % clustering_method))

    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    templates = list(map(
        lambda page: os.path.join(TEMPLATES, 'beta_rarefaction_assets', page),
        ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']))

    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': [{'url': 'emperor.html',
                  'title': 'PCoA'},
                 {'url': 'heatmap.html',
                  'title': 'Heatmap'},
                 {'url': 'tree.html',
                  'title': 'Clustering'}]
    }

    q2templates.render(templates, output_dir, context=context)
Example #14
def preprocess(
    ctx,
    table,
    metadata,
    sampling_depth,
    min_frequency,
    target_variable,
    discrete,
    phylogeny=None,
    with_replacement=False,
    n_jobs=1,
):

    # Define QIIME2 methods to call
    rarefy = ctx.get_action("feature_table", "rarefy")
    filter_min_features = ctx.get_action("feature_table", "filter_features")
    filter_samples = ctx.get_action("feature_table", "filter_samples")
    beta = ctx.get_action("diversity", "beta")
    beta_phylogenetic = ctx.get_action("diversity", "beta_phylogenetic")
    filter_features = ctx.get_action("fragment-insertion", "filter_features")
    results = []

    print("Inital sizes")
    print_datasize(table, metadata)

    initial_ids_to_keep = table.view(biom.Table).ids()
    table_id_set = set(initial_ids_to_keep)
    metadata_id_set = set(metadata.ids)
    shared_ids = table_id_set.intersection(metadata_id_set)
    num_shared_ids = len(shared_ids)
    if num_shared_ids == 0:
        raise ValueError("No sample IDs are shared between Table and Metadata")
    print(
        "# of shared sample IDs between Table and Metadata: ",
        num_shared_ids, "\n"
    )

    # Filter metadata by samples in table
    print("Filtering Metadata by samples in table")
    filteredmetadata = metadata.filter_ids(ids_to_keep=shared_ids)
    print_datasize(table, filteredmetadata)

    # Filter samples from metadata where NaN in target_variable column
    # Reduce metadata to 1 column mapping of sample-id to target
    print(
        "Filtering samples from Metadata where NaN in target_variable column"
    )
    print("Reducing Metadata to 1 column mapping of sample-id to target")
    df = filteredmetadata.to_dataframe()
    clean_subset_df = clean_metadata(
        df=df, target_variable=target_variable, discrete=discrete
    )
    target_mapping = Metadata(clean_subset_df)
    print_datasize(table, target_mapping)

    # Filter features that do not exist in phylogeny
    if phylogeny:
        print("Filtering features from Table that do not exist in phylogeny")
        phylo_filtered_results = filter_features(table=table, tree=phylogeny)
        table = phylo_filtered_results.filtered_table
        print_datasize(table, target_mapping)

    # Filter low-abundance features from table
    print(
        f"Filtering low-abundance features (frequency<{min_frequency}) from Table"
    )
    (table,) = filter_min_features(
        table=table, min_frequency=min_frequency
    )
    print_datasize(table, target_mapping)

    # Rarefy Table to sampling_depth
    print(f"Rarefying Table to sampling depth of {sampling_depth}")
    (rarefied_table,) = rarefy(
        table=table,
        sampling_depth=sampling_depth,
        with_replacement=with_replacement,
    )
    print_datasize(rarefied_table, target_mapping)

    print("Filtering Rarefied Table by samples in Metadata")
    filtered_rarefied_table_results = filter_samples(
        table=rarefied_table, metadata=target_mapping
    )
    filtered_rarefied_table = filtered_rarefied_table_results.filtered_table
    print_datasize(filtered_rarefied_table, target_mapping)
    results += filtered_rarefied_table_results

    # Refilter target_mapping by samples in table
    print("Refiltering Metadata by samples in Rarefied Table")
    ids_to_keep = filtered_rarefied_table.view(biom.Table).ids()
    target_mapping = target_mapping.filter_ids(ids_to_keep=ids_to_keep)
    print_datasize(filtered_rarefied_table, target_mapping)

    # Filter Rarefied Table by samples in metadata
    print("Filtering Unrarefied Table by samples in Metadata to match Rarefied Table")
    filtered_table_results = filter_samples(
        table=table, metadata=target_mapping
    )
    print_datasize(filtered_table_results.filtered_table, target_mapping)
    results += filtered_table_results

    # Some transformations to get data into correct format for artifact
    target_mapping_col = target_mapping.get_column(target_variable)
    target_mapping_series = target_mapping_col.to_series()
    print("Reindexing Metadata to match Sample ID order of Table")
    target_mapping_series = target_mapping_series.reindex(
        index=ids_to_keep, copy=False
    )
    print("Validating Table and Metadata Sample ID agreement...")
    if list(target_mapping_series.index) != list(ids_to_keep):
        print(list(target_mapping_series.index))
        print(ids_to_keep)
        raise ValueError(
            "Table and Metadata Sample IDs do not match in contents and/or order"
        )
    target_mapping_artifact = ctx.make_artifact(
        "SampleData[Target]", target_mapping_series
    )
    results += [target_mapping_artifact]

    # Generate Distance Matrices
    print("Generating Distance Matrices...")
    for metric in ["jaccard", "braycurtis", "jensenshannon", "aitchison"]:
        beta_results = beta(
            table=filtered_rarefied_table, metric=metric, n_jobs=n_jobs
        )
        results += beta_results
    if phylogeny:
        for metric in ["unweighted_unifrac", "weighted_unifrac"]:
            beta_phylo_results = beta_phylogenetic(
                table=filtered_rarefied_table,
                phylogeny=phylogeny,
                metric=metric,
                threads=n_jobs,
            )
            results += beta_phylo_results
    else:
        # No phylogeny: return empty placeholder distance matrices.
        results += 2 * [Artifact.import_data(
            "DistanceMatrix", skbio.DistanceMatrix(data=[])
        )]
    return tuple(results)
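clean_metadata is not defined in this listing; a plausible sketch of its behavior, inferred from how it is called above (the discrete handling is an assumption):

import pandas as pd

def clean_metadata(df, target_variable, discrete):
    # Keep only the target column and drop samples with missing values.
    subset = df[[target_variable]].dropna(axis='index', how='any')
    if discrete:
        # Hypothetical: coerce discrete targets to strings for grouping.
        subset[target_variable] = subset[target_variable].astype(str)
    return subset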
Example #15
        print("%s sample pairs matched together" %
              (len(case_to_control_match.keys())))

        for key in case_to_control_match:
            key_value = case_to_control_match[key]
            matchDF.at[key, "matched_to"] = str(key_value)
            matchDF.at[key_value, "matched_to"] = str(key)
    else:
        print("%s cases matched" % (len(case_dictionary.keys())))
        for case in case_dictionary:
            for control in case_dictionary[case]:
                if control in control_dictionary:
                    control_dictionary[control].append(case)
                else:
                    control_dictionary[control] = [case]
            matchDF.at[case,
                       "matched_to"] = ", ".join(sorted(case_dictionary[case]))

        for control in control_dictionary:
            matchDF.at[control, "matched_to"] = ", ".join(
                sorted(control_dictionary[control]))

    matchedMD = Metadata(matchDF)
    if only_matches:
        ids = matchedMD.get_ids("matched_to NOT IN ('none')")
        # shrink the Metadata to only the matched samples
        matchedMD = matchedMD.filter_ids(ids)

    return matchedMD
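The final filtering step uses Metadata.get_ids, which accepts a SQLite WHERE clause. A minimal sketch with toy data:

import pandas as pd
from qiime2 import Metadata

df = pd.DataFrame({'matched_to': ['s2', 's1', 'none']},
                  index=pd.Index(['s1', 's2', 's3'], name='id'))
md = Metadata(df)
ids = md.get_ids("matched_to NOT IN ('none')")   # {'s1', 's2'}
md = md.filter_ids(ids)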
Example #16
def simple_plot(output_dir,
                table: biom.Table,
                feature_tree: skbio.TreeNode,
                metadata: q2.Metadata,
                case_where: str,
                control_where: str,
                n_transects: int = 10,
                stratify_by: str = None,
                mode: str = 'max'):
    print("Data extracted")
    layer_dir = os.path.join(output_dir, 'layers')
    rank_dir = os.path.join(output_dir, 'ranks')
    os.mkdir(layer_dir)
    os.mkdir(rank_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))
    get_pairs = comparisons(metadata, control_samples, case_samples,
                            stratify_by)

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))
    feature_tree = shear_no_prune(feature_tree, features)
    print("Extraneous features removed")

    for n in feature_tree.traverse():
        if not n.length:
            n.length = 0
    tree = tree_to_array(feature_tree, mode)
    print("Tree index created")

    possible_transects = len(np.unique(np.asarray(tree['distances'])))
    tree_length = tree['distances'][0]  # root of tree
    if n_transects > possible_transects:
        n_transects = possible_transects
        print("Only %d transects exist, using that instead" % n_transects)

    transects = list(np.linspace(0, tree_length, num=n_transects))
    print("Will transect at: %s" % ", ".join(map(str, transects)))

    figure_gen = prepare_plot(tree_length)
    figure_gen.send(None)  # initialize co-routine
    colors = []

    points, _ = pairwise_components(table, get_pairs())
    color_fig, highlight_fig, color = figure_gen.send((points, None))

    color_fig.savefig(os.path.join(layer_dir, 'original.png'),
                      transparent=True)
    plt.close(color_fig)
    highlight_fig.savefig(os.path.join(layer_dir, 'original.h.png'),
                          transparent=True)
    plt.close(highlight_fig)
    colors.append(color)

    rank_files = []
    collapsed_groups = pd.DataFrame()
    for distance in transects:
        collapsed_table, collapsed_counts, groups = group_by_transect(
            table, tree, distance)
        collapsed_groups[groups.name] = groups
        print("Table collapsed at transect %s" % distance)

        points, ranks = pairwise_components(collapsed_table, get_pairs())

        filename = write_ranks(rank_dir, collapsed_counts, ranks, distance)
        rank_files.append(filename)

        color_fig, highlight_fig, color = figure_gen.send((points, distance))
        colors.append(color)

        color_fig.savefig(os.path.join(layer_dir, 'T_%s.png' % distance),
                          transparent=True)
        plt.close(color_fig)
        highlight_fig.savefig(os.path.join(layer_dir, 'T_%s.h.png' % distance),
                              transparent=True)
        plt.close(highlight_fig)

    print("Finalizing visualization")
    figure = figure_gen.send((None, None))
    figure.savefig(os.path.join(layer_dir, 'trajectory.png'), transparent=True)
    plt.close(figure)

    background = next(figure_gen)
    background.savefig(os.path.join(layer_dir, 'bg.png'), transparent=True)
    plt.close(background)

    with open(os.path.join(output_dir, 'collapsed_groups.tsv'), 'w') as fh:
        collapsed_groups.to_csv(fh, sep='\t')

    with open(os.path.join(output_dir, 'index.html'), 'w') as fh:
        template = Environment(loader=BaseLoader).from_string(TEMPLATE)
        fh.write(template.render({
            'legend': list(zip(
                ['original'] + ['T_%s' % d for d in transects] +
                ['trajectory'],
                list(map(to_hex, colors)) + ['red'])),
            'filenames': rank_files}))