Python DistanceMatrix.filter示例，skbio.DistanceMatrix.filter Python示例

示例#1

0

显示文件

def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # convert metadata to numeric values where applicable, drop the non-numeric
    # values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})

示例#2

0

显示文件

文件： _visualizer.py 项目： jairideout/diversity

def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # convert metadata to numeric values where applicable, drop the non-numeric
    # values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})

示例#3

0

显示文件

文件： _filter.py 项目： nervous-laughter/q2-diversity

def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           sample_metadata: qiime2.Metadata,
                           where: str = None) -> skbio.DistanceMatrix:
    ids_to_keep = sample_metadata.ids(where=where)
    # NOTE: there is no guaranteed ordering to output distance matrix because
    # `ids_to_keep` is a set, and `DistanceMatrix.filter` uses its iteration
    # order.
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError:
        raise ValueError(
            "All samples were filtered out of the distance matrix.")

示例#4

0

显示文件

def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if more filtering is supported in the future.
    df = metadata.to_dataframe()
    df = df.dropna()
    metadata = qiime2.Metadata(df)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    df = metadata.to_dataframe()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'initial_dm_length':
                           initial_dm_length,
                           'filtered_dm_length':
                           filtered_dm_length,
                           'non_numeric_cols':
                           ', '.join(sorted(non_numeric_cols)),
                           'zero_variance_cols':
                           ', '.join(sorted(zero_variance_cols)),
                           'result':
                           result
                       })

示例#5

0

显示文件

文件： _filter.py 项目： gregcaporaso/diversity

def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           metadata: qiime2.Metadata,
                           where: str=None,
                           exclude_ids: bool=False) -> skbio.DistanceMatrix:
    ids_to_keep = metadata.ids(where=where)
    if exclude_ids:
        ids_to_keep = set(distance_matrix.ids) - set(ids_to_keep)
    # NOTE: there is no guaranteed ordering to output distance matrix because
    # `ids_to_keep` is a set, and `DistanceMatrix.filter` uses its iteration
    # order.
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError:
        raise ValueError(
            "All samples were filtered out of the distance matrix.")

示例#6

0

显示文件

文件： _visualizer.py 项目： qiime2/q2-diversity

def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if this type of filtering is supported in the
    # future.
    df = metadata.to_dataframe()
    df = df.dropna(axis='index', how='any')

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})

示例#7

0

显示文件

文件： _visualizer.py 项目： gwarmstrong/q2-diversity

def mantel(output_dir: str,
           dm1: skbio.DistanceMatrix,
           dm2: skbio.DistanceMatrix,
           method: str = 'spearman',
           permutations: int = 999,
           intersect_ids: bool = False,
           label1: str = 'Distance Matrix 1',
           label2: str = 'Distance Matrix 2') -> None:
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    # The following code to handle mismatched IDs, and subsequently filter the
    # distance matrices, is not technically necessary because skbio's mantel
    # function will raise an error on mismatches with `strict=True`, and will
    # handle intersection if `strict=False`. However, we need to handle the ID
    # matching explicitly to find *which* IDs are mismatched -- the error
    # message coming from scikit-bio doesn't describe those. We also need to
    # have the mismatched IDs to display as a warning in the viz if
    # `intersect_ids=True`. Finally, the distance matrices are explicitly
    # filtered to matching IDs only because their data are used elsewhere in
    # this function (e.g. extracting scatter plot data).

    # Find the symmetric difference between ID sets.
    ids1 = set(dm1.ids)
    ids2 = set(dm2.ids)
    mismatched_ids = ids1 ^ ids2

    if not intersect_ids and mismatched_ids:
        raise ValueError(
            'The following ID(s) are not contained in both distance matrices. '
            'This sometimes occurs when mismatched files are passed. If this '
            'is not the case, you can use `intersect_ids` to discard these '
            'mismatches and apply the Mantel test to only those IDs that are '
            'found in both distance matrices.\n\n%s' %
            ', '.join(sorted(mismatched_ids)))

    if mismatched_ids:
        matched_ids = ids1 & ids2
        # Run in `strict` mode because the matches should all be found in both
        # matrices.
        dm1 = dm1.filter(matched_ids, strict=True)
        dm2 = dm2.filter(matched_ids, strict=True)

    # Run in `strict` mode because all IDs should be matched at this point.
    r, p, sample_size = skbio.stats.distance.mantel(dm1,
                                                    dm2,
                                                    method=method,
                                                    permutations=permutations,
                                                    alternative=alt_hypothesis,
                                                    strict=True)

    result = pd.Series(
        [method.title(), sample_size, permutations, alt_hypothesis, r, p],
        index=[
            'Method', 'Sample size', 'Permutations', 'Alternative hypothesis',
            '%s %s' % (method.title(), test_statistics[method]), 'p-value'
        ],
        name='Mantel test results')
    table_html = q2templates.df_to_html(result.to_frame())

    # We know the distance matrices have matching ID sets at this point, so we
    # can safely generate all pairs of IDs using one of the matrices' ID sets
    # (it doesn't matter which one).
    scatter_data = []
    for id1, id2 in itertools.combinations(dm1.ids, 2):
        scatter_data.append((dm1[id1, id2], dm2[id1, id2]))

    plt.figure()
    x = 'Pairwise Distance (%s)' % label1
    y = 'Pairwise Distance (%s)' % label2
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
    plt.savefig(os.path.join(output_dir, 'mantel-scatter.svg'))

    context = {
        'table': table_html,
        'sample_size': sample_size,
        'mismatched_ids': mismatched_ids
    }
    index = os.path.join(TEMPLATES, 'mantel_assets', 'index.html')
    q2templates.render(index, output_dir, context=context)

示例#8

0

显示文件

文件： _visualizer.py 项目： gwarmstrong/q2-diversity

def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix,
                                        metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict([
        (id, list(series.index))
        for id, series in natsorted(metadata.groupby(metadata))
    ])

    pairs_summary = pd.DataFrame(
        columns=['SubjectID1', 'SubjectID2', 'Group1', 'Group2', 'Distance'])
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        group_pairs_summary = pd.DataFrame(group_pairs_summary,
                                           columns=[
                                               'SubjectID1', 'SubjectID2',
                                               'Group1', 'Group2', 'Distance'
                                           ])

        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])

        ax = sns.boxplot(data=group_distances,
                         flierprops={
                             'marker': 'o',
                             'markeredgecolor': 'black',
                             'markeredgewidth': 0.5,
                             'alpha': 0.5
                         })
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(
            os.path.join(output_dir, '%s-boxplots.png' %
                         urllib.parse.quote(str(group_id))))
        fig.savefig(
            os.path.join(output_dir, '%s-boxplots.pdf' %
                         urllib.parse.quote(str(group_id))))
        fig.clear()

    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([
                group1_id, group2_id, pairwise_result['sample size'],
                permutations, pairwise_result['test statistic'],
                pairwise_result['p-value']
            ])
        columns = [
            'Group 1', 'Group 2', 'Sample size', 'Permutations',
            result['test statistic name'], 'p-value'
        ]
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = [
        # We have to DOUBLE encode this, as the file/resource name is a literal
        # URI-encoded string, we do this to prevent issues with the filesystem
        # however, as a result, our links need to escape % so that the browser
        # asks for the right escaped name (instead of the original name, which
        # doesn't exist inside the visualization).
        urllib.parse.quote(urllib.parse.quote(k)) for k in groupings.keys()
    ]
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [
        group_ids[g:g + row_count] for g in range(0, group_count, row_count)
    ]

    index = os.path.join(TEMPLATES, 'beta_group_significance_assets',
                         'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'initial_dm_length': initial_dm_length,
                           'filtered_dm_length': filtered_dm_length,
                           'method': method,
                           'group_rows': group_rows,
                           'bootstrap_group_col_size': int(12 / row_count),
                           'result': result_html,
                           'pairwise_results': pairwise_results_html
                       })

示例#9

0

显示文件

文件： test_mantel.py 项目： nbresnick/scikit-bio

class PairwiseMantelTests(TestCase):
    def setUp(self):
        self.minx = DistanceMatrix([[0, 1, 2], [1, 0, 3], [2, 3, 0]])
        self.miny = DistanceMatrix([[0, 2, 7], [2, 0, 6], [7, 6, 0]])
        self.minz = DistanceMatrix([[0, 0.5, 0.25],
                                    [0.5, 0, 0.1],
                                    [0.25, 0.1, 0]])
        self.min_dms = (self.minx, self.miny, self.minz)

        # Versions of self.minx and self.minz (above) that each have an extra
        # ID on the end.
        self.x_extra = DistanceMatrix([[0, 1, 2, 7],
                                       [1, 0, 3, 2],
                                       [2, 3, 0, 4],
                                       [7, 2, 4, 0]], ['0', '1', '2', 'foo'])
        self.z_extra = DistanceMatrix([[0, 0.5, 0.25, 3],
                                       [0.5, 0, 0.1, 24],
                                       [0.25, 0.1, 0, 5],
                                       [3, 24, 5, 0]], ['0', '1', '2', 'bar'])

        # Load expected results. We have to load the p-value column (column
        # index 3) as a string dtype in order to compare with the in-memory
        # results since we're formatting the p-values as strings with the
        # correct number of decimal places. Without this explicit converter,
        # the p-value column will be loaded as a float dtype and the frames
        # won't compare equal.
        p_val_conv = {3: str}

        self.exp_results_minimal = pd.read_csv(
            get_data_path('pwmantel_exp_results_minimal.txt'), sep='\t',
            index_col=(0, 1), converters=p_val_conv)

        self.exp_results_minimal_with_labels = pd.read_csv(
            get_data_path('pwmantel_exp_results_minimal_with_labels.txt'),
            sep='\t', index_col=(0, 1), converters=p_val_conv)

        self.exp_results_duplicate_dms = pd.read_csv(
            get_data_path('pwmantel_exp_results_duplicate_dms.txt'),
            sep='\t', index_col=(0, 1), converters=p_val_conv)

        self.exp_results_na_p_value = pd.read_csv(
            get_data_path('pwmantel_exp_results_na_p_value.txt'),
            sep='\t', index_col=(0, 1), converters=p_val_conv)

        self.exp_results_too_few_permutations = pd.read_csv(
            get_data_path('pwmantel_exp_results_too_few_permutations.txt'),
            sep='\t', index_col=(0, 1), converters=p_val_conv)

        self.exp_results_reordered_distance_matrices = pd.read_csv(
            get_data_path('pwmantel_exp_results_reordered_distance_matrices'
                          '.txt'),
            sep='\t', index_col=(0, 1), converters=p_val_conv)

    def test_minimal_compatible_input(self):
        # Matrices are already in the correct order and have matching IDs.
        np.random.seed(0)

        obs = pwmantel(self.min_dms, alternative='greater')
        assert_frame_equal(obs, self.exp_results_minimal)

    def test_minimal_compatible_input_with_labels(self):
        np.random.seed(0)

        obs = pwmantel(self.min_dms, alternative='greater',
                       labels=('minx', 'miny', 'minz'))
        assert_frame_equal(obs, self.exp_results_minimal_with_labels)

    def test_duplicate_dms(self):
        obs = pwmantel((self.minx, self.minx, self.minx), alternative='less')
        assert_frame_equal(obs, self.exp_results_duplicate_dms)

    def test_na_p_value(self):
        obs = pwmantel((self.miny, self.minx), method='spearman',
                       permutations=0)
        assert_frame_equal(obs, self.exp_results_na_p_value)

    def test_too_few_permutations_for_p_value(self):
        obs = pwmantel((self.miny, self.minx), method='spearman',
                       permutations=9)
        assert_frame_equal(obs, self.exp_results_too_few_permutations)

    def test_reordered_distance_matrices(self):
        # Matrices have matching IDs but they all have different ordering.
        x = self.minx.filter(['1', '0', '2'])
        y = self.miny.filter(['0', '2', '1'])
        z = self.minz.filter(['1', '2', '0'])

        np.random.seed(0)

        obs = pwmantel((x, y, z), alternative='greater')
        assert_frame_equal(obs, self.exp_results_reordered_distance_matrices)

    def test_strict(self):
        # Matrices have some matching and nonmatching IDs, with different
        # ordering.
        x = self.x_extra.filter(['1', '0', 'foo', '2'])
        y = self.miny.filter(['0', '2', '1'])
        z = self.z_extra.filter(['bar', '1', '2', '0'])

        np.random.seed(0)

        # strict=False should discard IDs that aren't found in both matrices
        obs = pwmantel((x, y, z), alternative='greater', strict=False)
        assert_frame_equal(obs, self.exp_results_reordered_distance_matrices)

        with self.assertRaises(ValueError):
            pwmantel((x, y, z), strict=True)

    def test_id_lookup(self):
        # Matrices have mismatched IDs but a lookup is provided.
        self.x_extra.ids = ['a', 'b', 'c', 'foo']
        self.z_extra.ids = ['d', 'e', 'f', 'bar']
        lookup = {'a': '0', 'b': '1', 'c': '2', 'foo': 'foo',
                  'd': '0', 'e': '1', 'f': '2', 'bar': 'bar',
                  '0': '0', '1': '1', '2': '2'}

        x = self.x_extra.filter(['b', 'a', 'foo', 'c'])
        y = self.miny.filter(['0', '2', '1'])
        z = self.z_extra.filter(['bar', 'e', 'f', 'd'])

        x_copy = x.copy()
        y_copy = y.copy()
        z_copy = z.copy()

        np.random.seed(0)

        obs = pwmantel((x, y, z), alternative='greater', strict=False,
                       lookup=lookup)
        assert_frame_equal(obs, self.exp_results_reordered_distance_matrices)

        # Make sure the inputs aren't modified.
        self.assertEqual(x, x_copy)
        self.assertEqual(y, y_copy)
        self.assertEqual(z, z_copy)

    def test_too_few_dms(self):
        with self.assertRaises(ValueError):
            pwmantel([self.miny])

    def test_invalid_input_type(self):
        with self.assertRaises(TypeError):
            pwmantel([self.miny, self.minx, [[0, 42], [42, 0]]])

    def test_wrong_number_of_labels(self):
        with self.assertRaises(ValueError):
            pwmantel(self.min_dms, labels=['foo', 'bar'])

    def test_duplicate_labels(self):
        with self.assertRaises(ValueError):
            pwmantel(self.min_dms, labels=['foo', 'bar', 'foo'])

    def test_missing_ids_in_lookup(self):
        # mapping for '1' is missing
        lookup = {'0': 'a', '2': 'c'}

        with self.assertRaises(KeyError):
            pwmantel(self.min_dms, lookup=lookup)

    def test_no_matching_ids(self):
        self.minx.ids = ['foo', 'bar', 'baz']
        self.miny.ids = ['bro', 'fist', 'breh']

        with self.assertRaises(ValueError):
            pwmantel((self.minx, self.miny, self.minz), strict=False)

示例#10

0

显示文件

def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str='permanova',
                            permutations: int=999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. Then filter any samples that are not in the distance matrix,
    # and drop samples with have no data for this metadata
    # category, including those with empty strings as values.
    metadata = pd.to_numeric(metadata.to_series(), errors='ignore')
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style("white")
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in sorted(metadata.groupby(metadata))])

    for group_id in groupings:
        group_distances, x_ticklabels = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    result = result.to_frame().to_html(classes="table table-striped "
                                       "table-hover")
    result = result.replace('border="1"', 'border="0"')
    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result
    })

示例#11

0

显示文件

文件： bench_general.py 项目： ElDeveloper/regression-benchmarking

        print("Running default parameters for " + name)
    
    is_distmatrix = name in dm_set #Boolean switch for distance-matrix specific code blocks
    if is_distmatrix: ##### Use specific X and y for distance matrix benchmarking, not amplicon experiment object
        
        if name=="jensenshannon":
            md = exp.sample_metadata
            existing_dm = DistanceMatrix.read(dir_prefix+"/beta-q2/"+"aitchison"+'.txt')
            print("Computing Jensen-Shannon Distance Matrix")
            dm = DistanceMatrix(data=distance.pdist(exp.data.todense(), metric="jensenshannon"), ids=existing_dm.ids)
        else:
            dm = DistanceMatrix.read(dir_prefix+"/beta-q2/"+name+'.txt')
            
        md = exp.sample_metadata
        md = md.filter(dm.ids,axis='index')
        dm = dm.filter(md.index, strict=True)
        
        X_dist = dm.data
        y_dist = md[target]
        
    # Make directory for this regressor if it does not yet exist
    dir_name = dir_prefix +'/' +dir_prefix + '-' + name
    print(dir_name)
    if not os.path.isdir(dir_name): 
        os.mkdir(dir_name, mode=0o755)

    paramsList = list(ParameterGrid(grid))   
       
    # For each set of parameters, get scores for model across 10 folds
    for param_idx, param in enumerate(paramsList):

示例#12

0

显示文件

文件： _visualizer.py 项目： qiime2/q2-diversity

def mantel(output_dir: str, dm1: skbio.DistanceMatrix,
           dm2: skbio.DistanceMatrix, method: str = 'spearman',
           permutations: int = 999, intersect_ids: bool = False,
           label1: str = 'Distance Matrix 1',
           label2: str = 'Distance Matrix 2') -> None:
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    # The following code to handle mismatched IDs, and subsequently filter the
    # distance matrices, is not technically necessary because skbio's mantel
    # function will raise an error on mismatches with `strict=True`, and will
    # handle intersection if `strict=False`. However, we need to handle the ID
    # matching explicitly to find *which* IDs are mismatched -- the error
    # message coming from scikit-bio doesn't describe those. We also need to
    # have the mismatched IDs to display as a warning in the viz if
    # `intersect_ids=True`. Finally, the distance matrices are explicitly
    # filtered to matching IDs only because their data are used elsewhere in
    # this function (e.g. extracting scatter plot data).

    # Find the symmetric difference between ID sets.
    ids1 = set(dm1.ids)
    ids2 = set(dm2.ids)
    mismatched_ids = ids1 ^ ids2

    if not intersect_ids and mismatched_ids:
        raise ValueError(
            'The following ID(s) are not contained in both distance matrices. '
            'This sometimes occurs when mismatched files are passed. If this '
            'is not the case, you can use `intersect_ids` to discard these '
            'mismatches and apply the Mantel test to only those IDs that are '
            'found in both distance matrices.\n\n%s'
            % ', '.join(sorted(mismatched_ids)))

    if mismatched_ids:
        matched_ids = ids1 & ids2
        # Run in `strict` mode because the matches should all be found in both
        # matrices.
        dm1 = dm1.filter(matched_ids, strict=True)
        dm2 = dm2.filter(matched_ids, strict=True)

    # Run in `strict` mode because all IDs should be matched at this point.
    r, p, sample_size = skbio.stats.distance.mantel(
            dm1, dm2, method=method, permutations=permutations,
            alternative=alt_hypothesis, strict=True)

    result = pd.Series([method.title(), sample_size, permutations,
                       alt_hypothesis, r, p],
                       index=['Method', 'Sample size', 'Permutations',
                              'Alternative hypothesis',
                              '%s %s' % (method.title(),
                                         test_statistics[method]),
                              'p-value'],
                       name='Mantel test results')
    table_html = q2templates.df_to_html(result.to_frame())

    # We know the distance matrices have matching ID sets at this point, so we
    # can safely generate all pairs of IDs using one of the matrices' ID sets
    # (it doesn't matter which one).
    scatter_data = []
    for id1, id2 in itertools.combinations(dm1.ids, 2):
        scatter_data.append((dm1[id1, id2], dm2[id1, id2]))

    plt.figure()
    x = 'Pairwise Distance (%s)' % label1
    y = 'Pairwise Distance (%s)' % label2
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
    plt.savefig(os.path.join(output_dir, 'mantel-scatter.svg'))

    context = {
        'table': table_html,
        'sample_size': sample_size,
        'mismatched_ids': mismatched_ids
    }
    index = os.path.join(
        TEMPLATES, 'mantel_assets', 'index.html')
    q2templates.render(index, output_dir, context=context)

示例#13

0

显示文件

文件： _visualizer.py 项目： qiime2/q2-diversity

def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in natsorted(metadata.groupby(metadata))])

    pairs_summary = pd.DataFrame(columns=['SubjectID1', 'SubjectID2', 'Group1',
                                          'Group2', 'Distance'])
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        group_pairs_summary = pd.DataFrame(
            group_pairs_summary, columns=['SubjectID1', 'SubjectID2',
                                          'Group1', 'Group2', 'Distance'])

        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = list(groupings.keys())
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [group_ids[g:g+row_count] for g in range(0, group_count,
                                                          row_count)]

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })

示例#14

0

显示文件

文件： _visualizer.py 项目： jairideout/diversity

def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str='permanova',
                            pairwise: bool=False,
                            permutations: int=999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns)))

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. Then filter any samples that are not in the distance matrix,
    # and drop samples with have no data for this metadata
    # category, including those with empty strings as values.
    metadata = pd.to_numeric(metadata.to_series(), errors='ignore')
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style("white")
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in sorted(metadata.groupby(metadata))])

    for group_id in groupings:
        group_distances, x_ticklabels = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = pairwise_results.to_html(
            classes=("table table-striped table-hover"))
        pairwise_results_html = pairwise_results_html.replace(
            'border="1"', 'border="0"')
    else:
        pairwise_results_html = None

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })