Example #1
    def test_biom_match(self):
        table = Table(
            np.array([[0, 0, 1, 1],
                      [2, 3, 4, 4],
                      [5, 5, 3, 3]]).T,
            ['a', 'b', 'c', 'd'],
            ['s2', 's3', 's4'])
        md = pd.DataFrame(
            {
                'x1': [1, 3, 2],
                'x2': [1, 1, 0]
            },
            index=['s1', 's2', 's3']
        )

        exp_table = Table(
            np.array(
                [
                    [0, 0, 1, 1],
                    [2, 3, 4, 4]
                ]).T,
            ['a', 'b', 'c', 'd'],
            ['s2', 's3'])
        exp_md = pd.DataFrame(
            {
                'x1': [3, 2],
                'x2': [1, 0]
            },
            index=['s2', 's3']
        )

        res_table, res_md = match(table, md)
        exp_df = pd.DataFrame(exp_table.to_dataframe())
        res_df = pd.DataFrame(res_table.to_dataframe())

        exp_df = exp_df.reindex(sorted(exp_df.columns), axis=1)
        res_df = res_df.reindex(sorted(res_df.columns), axis=1)

        pdt.assert_frame_equal(exp_df, res_df)

        exp_md = exp_md.reindex(sorted(exp_md.index), axis=0)
        res_md = res_md.reindex(sorted(res_md.index), axis=0)

        pdt.assert_frame_equal(res_md, exp_md)
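    # Hedged illustration (not part of the original test): the behaviour being
    # asserted is that match() keeps only the sample IDs shared between the
    # biom table ('s2', 's3', 's4') and the metadata index ('s1', 's2', 's3'),
    # i.e. {'s2', 's3'}. A minimal sketch of that intersection, assuming the
    # `table` and `md` objects built above:
    #
    #     shared = set(table.ids(axis='sample')) & set(md.index)
    #     md_matched = md.loc[sorted(shared)]
    #     table_matched = table.filter(shared, axis='sample', inplace=False)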
Example #2
def rpca(
    table: biom.Table,
    n_components: int = DEFAULT_RANK,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations: int = DEFAULT_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Runs RPCA with an rclr preprocessing step.

       This code will be run by both the standalone and QIIME 2 versions of
       DEICODE.
    """

    # filter sample to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter and import table
    table = table.filter(observation_filter, axis='observation')
    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')

    # rclr preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(rclr(table))

    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    u, s, v = svd(X)
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]
    feature_loading = pd.DataFrame(v, index=table.columns, columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index, columns=rename_cols)

    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if the n_components is two, add a PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in DEICODE -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
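# Hedged usage sketch (not from the DEICODE source): one plausible way to call
# the rpca() defined above on a toy biom.Table. The IDs, counts, and thresholds
# are invented, and a real run would use a much larger table so that OptSpace
# has enough non-zero entries to work with.
#
#     import numpy as np
#     import biom
#
#     counts = np.array([[10, 4, 0, 7, 5],
#                        [0, 3, 6, 2, 1],
#                        [5, 0, 1, 9, 8],
#                        [2, 2, 2, 0, 4]])
#     toy = biom.Table(counts, ['f1', 'f2', 'f3', 'f4'],
#                      ['s1', 's2', 's3', 's4', 's5'])
#     ordination, distance = rpca(toy, n_components=2,
#                                 min_sample_count=0, min_feature_count=0)
#     print(ordination.proportion_explained)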
Example #3
def get_matched_tables(collated_fingerprints: pd.DataFrame,
                       smiles: pd.DataFrame, feature_table: biom.Table):
    '''
    This function filters the feature table to retain only features with
    fingerprints. It also relabels features with the MD5 hash of their
    binary fingerprint vectors.

    Parameters
    ----------
    collated_fingerprints : pd.DataFrame
        table containing mass-spec molecular substructures (columns) for each
        mass-spec feature (index)
    smiles : pd.DataFrame
        table containing SMILES strings for each mass-spec feature (index)
    feature_table : biom.Table
        feature table with mass-spec feature intensity per sample

    Raises
    ------
    ValueError
        If features in the collated fingerprint table are not a subset of
        the features in ``feature_table``

    Returns
    -------
    pd.DataFrame
        fingerprint table with features relabeled with the MD5 hash of
        their binary fingerprint vectors
    biom.Table
        feature table filtered to contain only the features with predicted
        fingerprints; features are relabeled by the MD5 hash of their
        binary fingerprint vectors
    pd.DataFrame
        table that maps the MD5 hash of a feature to the original feature ID
        in the input feature table
    '''
    fps = collated_fingerprints.copy()
    allfps = list(fps.index)
    if fps.empty:
        raise ValueError("Cannot have empty fingerprint table")
    table = feature_table.to_dataframe(dense=True)
    allfeatrs = set(table.index)
    if not set(allfps).issubset(allfeatrs):
        extra_feats = set(allfps) - allfeatrs
        raise ValueError('The following features were not '
                         'found in the feature table:\n' +
                         ', '.join([str(i) for i in extra_feats]))
    filtered_table = table.reindex(allfps)
    list_md5 = []
    for fid in allfps:
        md5 = str(hashlib.md5(fps.loc[fid].values.tobytes()).hexdigest())
        list_md5.append(md5)
    fps['label'] = list_md5
    filtered_table['label'] = list_md5
    feature_data = pd.DataFrame(columns=[
        'label', '#featureID', 'csi_smiles', 'ms2_smiles', 'ms2_compound',
        'ms2_adduct'
    ])
    feature_data['label'] = list_md5
    feature_data['#featureID'] = allfps
    feature_data['csi_smiles'] = list(smiles.loc[allfps, 'csi_smiles'])
    feature_data['ms2_smiles'] = list(smiles.loc[allfps, 'ms2_smiles'])
    feature_data['ms2_compound'] = list(smiles.loc[allfps, 'ms2_compound'])
    feature_data['ms2_adduct'] = list(smiles.loc[allfps, 'ms2_adduct'])
    feature_data.set_index('label', inplace=True)
    relabel_fps = fps.groupby('label').first()
    matched_table = filtered_table.groupby('label').sum()
    # biom requires that ids be strings
    npfeatures = matched_table.values
    matched_table = biom.table.Table(
        data=npfeatures,
        observation_ids=matched_table.index.astype(str),
        sample_ids=matched_table.columns.astype(str))

    return relabel_fps, matched_table, feature_data
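# A minimal, self-contained sketch (not from the original module) of the
# relabeling step above: hash each feature's binary fingerprint vector with
# MD5, then collapse rows that share a hash. The feature IDs, fingerprints,
# and intensities below are invented for illustration.
import hashlib

import pandas as pd

fps = pd.DataFrame([[0, 1, 1], [0, 1, 1], [1, 0, 1]],
                   index=['feat1', 'feat2', 'feat3'],
                   columns=['sub1', 'sub2', 'sub3'])
intensities = pd.DataFrame([[10, 0], [5, 2], [1, 4]],
                           index=['feat1', 'feat2', 'feat3'],
                           columns=['sampleA', 'sampleB'])

labels = [hashlib.md5(fps.loc[fid].values.tobytes()).hexdigest()
          for fid in fps.index]
intensities['label'] = labels

# feat1 and feat2 share a fingerprint, so they get the same MD5 label and
# their per-sample intensities are summed into a single row.
collapsed = intensities.groupby('label').sum()
print(collapsed)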
Example #4
def process(infile1: biom.Table,
            sample_types: MetadataColumn,
            metric: Str,
            conditioning: Str,
            infile2: biom.Table = None,
            name: Str = "-name-",
            ab_comp: Bool = False,
            min_count: Int = 3,
            total_select: Str = "all",
            iteration_select: Set[Int] = None,
            pca_components: Int = 4,
            smooth_type: Str = "sliding_window",
            window_size: Int = 3,
            centrality: Str = None,
            keep_threshold: Float = 0.5,
            correlation: Str = None,
            weighted: Bool = False,
            correlation_prop: Str = "both",
            evaluation: Str = "kl_divergence",
            min_connected: Int = 0,
            detailed: Bool = False) -> list:
    """
    This function corresponds to the QIIME 2 plugin entry point and takes care
    of passing data between all parts of the plugin.

    :param infile1: The biom file (qza) from which OTU info is extracted and
        analyzed to generate an interaction table of taxa.
    :param sample_types: Metadata describing each sample taken and whether it
        is invaded/natural.
    :param metric: The metric to use.
    :param conditioning: Conditioning type to use on the data.
    :param infile2: Only used in the case of an A/B analysis; ignored if
        ab_comp is False.
    :param name: Attached to all detailed output as a means of identification.
    :param ab_comp: Boolean representing whether to perform an A/B comparison
        on the data.
    :param min_count: Features with counts below this number will be removed.
    :param total_select: Number of features to select in total, i.e. 1, 2, 3,
        ... or 'all'.
    :param iteration_select: Number of features to select each time the metric
        is called, i.e. 1, 2, 3, ...
    :param pca_components: Number of PCA components to find.
    :param smooth_type: Type of smoothing used to remove noise.
    :param window_size: If the smoothing type is a sliding window, this is the
        size of the window.
    :param centrality: If graph_centrality is the metric type, this is the type
        of centrality to use.
    :param keep_threshold: If graph_centrality is the metric type, this is the
        threshold used to remove weak edges.
    :param correlation: If graph centrality is the metric, this specifies
        whether positive, negative, or both types of correlation should be
        used.
    :param weighted: If graph_centrality is the metric type, this specifies
        whether weighted edges should be used to create the graph.
    :param correlation_prop:
    :param evaluation: The evaluation type to use.
    :param min_connected: The minimum percentage of connectedness of the graph
        that should be considered before the winnowing process is aborted.
    :param detailed: Notifies the plugin to output diagrams and csv files to
        each step's respective output folder throughout computation. If not
        enabled, these files will not be generated.
    :return: A list containing a single artifact; see artifact generation for
        details on why this is done.
    """

    print(
        "\n############################# START #############################")
    if iteration_select is None:  # Since default parameter can't be mutable
        iteration_select = {1, 4, 16, 64, 128}

    # make sure proper file structure is present
    _verify_output_folders()
    _verbose(step=0)

    # This will be used as part of the PERMANOVA calculation
    if not isinstance(sample_types, pd.DataFrame):
        # allows for easier testing and input directly from Python
        sample_types = sample_types.to_dataframe()
    # Make sure input is valid
    # this accounts for abundances being the same size as well in later steps
    num_samples = len(infile1.ids(axis='observation'))
    try:
        if "type" in sample_types.columns:
            num_sample_types = len(sample_types.loc[:, "type"])
        else:
            num_sample_types = len(sample_types.loc[:, "Type"])
    except KeyError:
        raise Exception(
            "Error: sample metadata must include a column titled Type.")
    if num_samples != num_sample_types:
        raise Exception(
            "Error: each provided sample must have a corresponding type (natural/invaded).\n"
            f"Was given {num_samples} samples and {num_sample_types} types.")

    # Verify parameters are all given
    _verify_input_is_provided(metric, conditioning, ab_comp, infile2,
                              centrality, correlation)

    # if ab_comp is used, we assume each sample type corresponds with samples 1..n of each dataframe
    if ab_comp:
        sample_types = pd.concat([sample_types, sample_types],
                                 ignore_index=True)

    metric_output = pd.DataFrame()  # dataframe to collect new metric results
    auc_output = pd.DataFrame()  # Keep most accurate AUC
    permanova_output = pd.DataFrame()  # Keep most accurate PERMANOVA value
    _verbose(step=0.5)

    for iteration_selected in sorted(iteration_select):

        # Convert input to dataframes
        dataframe_1 = infile1.to_dataframe(dense=True)
        dataframe_1.name = f"{name}_1_{iteration_selected}_"
        dataframe_2 = None
        if ab_comp:
            dataframe_2 = infile2.to_dataframe(dense=True)
            dataframe_2.name = f"{name}_2_{iteration_selected}_"
            if len(dataframe_1) != len(dataframe_2):
                raise Exception(
                    f"Error: Dataframes must be the same size in order to correlate with sample metadata. "
                    f"dataframe1: {len(dataframe_1)} != dataframe2: {len(dataframe_2)}"
                )

        name_new = f"{name}_{iteration_selected}_"  # will allow for easier iteration selection

        # <><><> Pass data to steps 1 to 3 <><><>
        _verbose(step=1)
        metric_result, important_features, abundances = \
            _winnow_pipeline( dataframe_1=dataframe_1, dataframe_2=dataframe_2, ab_comp=ab_comp, metric_name=metric,
                              c_type=conditioning, min_count=min_count, total_select=total_select, iteration_select=iteration_selected,
                              pca_components=pca_components, smooth_type=smooth_type, window_size=window_size,
                              centrality_type=centrality, keep_threshold=keep_threshold, correlation=correlation,
                              weighted=weighted, corr_prop=correlation_prop, evaluation_type=evaluation,
                              min_connected=min_connected, detailed=detailed )
        # these are used in: Step7_9, Step4_5, Step6

        # create a dataframe of important OTUs for the jaccard step
        if metric_output.empty:
            metric_output = metric_result
        else:
            if len(metric_output.columns) < len(metric_result.columns):
                # the dataframe must be extended to be able to hold new data
                new_columns = [
                    col for col in metric_result.columns
                    if col not in metric_output.columns
                ]
                for col in new_columns:
                    metric_output[col] = ""  # Default as empty
            metric_output = pd.concat(
                [metric_output, metric_result], sort=False, ignore_index=True
            )  # assign back since concat does not operate in place

        # check if a metric result was generated before attempting other steps; must be at least 2 OTUs
        if 1 in metric_result.columns and 2 in metric_result.columns:

            # <><><> Pass data to steps 4 to 5 <><><>
            _verbose(step=4)
            auc_results, auc_parameters = \
                _winnow_ordering( dataframe=important_features, name=name_new, detailed=detailed )
            # these are used in: Step6, None
            auc_output = auc_results

            # Note: sample types correspond with abundances being passed
            # print( abundances, auc_results, sample_types )

            # <><><> Pass data to step 6 <><><>
            _verbose(step=6)
            permanova_results = \
                _winnow_permanova( auc_ordering_df=auc_results, abundances_df=abundances, samples_df=sample_types,
                                   centrality_type=centrality, name=name_new, detailed=detailed )
            permanova_output = permanova_results
            _verbose(step=6.5)

        else:
            _verbose(step=1.5)

    # <><><>  Pass data to steps 7 to 9 <><><>
    _verbose(step=7)
    jaccard_results = _winnow_sensativity(
        metric_output,
        name=f"{metric}_{correlation}_{keep_threshold}_{centrality}_{name}",
        detailed=detailed)

    # Notify user of output path
    _verbose(step=10)
    print(
        f"Please see:\n\t{os.path.dirname(os.path.realpath(__file__))}\nfolder for detailed output."
    )
    print("############################# DONE #############################")

    # assemble output and return as artifact
    metric_output.replace(r'^\s*$', np.nan, regex=True,
                          inplace=True)  # Replace blanks with NaN
    artifact_directory = _assemble_artifact_output(metric_output, auc_output,
                                                   permanova_output,
                                                   jaccard_results)

    return artifact_directory
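# A small, self-contained illustration (not part of the plugin) of the
# accumulation pattern used for `metric_output` above: when a later iteration
# returns more columns than the running table, the running table is widened
# with empty columns before concatenation. Column names and values are invented.
import pandas as pd

metric_output = pd.DataFrame()
results = [pd.DataFrame({1: ['otu_a'], 2: ['otu_b']}),
           pd.DataFrame({1: ['otu_c'], 2: ['otu_d'], 3: ['otu_e']})]
for metric_result in results:
    if metric_output.empty:
        metric_output = metric_result
    else:
        for col in metric_result.columns:
            if col not in metric_output.columns:
                metric_output[col] = ""  # widen with an empty placeholder column
        metric_output = pd.concat([metric_output, metric_result],
                                  sort=False, ignore_index=True)
print(metric_output)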
Example #5
def multinomial(table: biom.Table,
                metadata: Metadata,
                formula: str,
                training_column: str = DEFAULTS["training-column"],
                num_random_test_examples: int = (
                    DEFAULTS["num-random-test-examples"]
                ),
                epochs: int = DEFAULTS["epochs"],
                batch_size: int = DEFAULTS["batch-size"],
                differential_prior: float = DEFAULTS["differential-prior"],
                learning_rate: float = DEFAULTS["learning-rate"],
                clipnorm: float = DEFAULTS["clipnorm"],
                min_sample_count: int = DEFAULTS["min-sample-count"],
                min_feature_count: int = DEFAULTS["min-feature-count"],
                summary_interval: int = DEFAULTS["summary-interval"],
                random_seed: int = DEFAULTS["random-seed"],
                ) -> (
                    pd.DataFrame, qiime2.Metadata, skbio.OrdinationResults
                ):

    # load metadata and tables
    metadata = metadata.to_dataframe()
    # match them
    table, metadata, design = match_and_filter(
        table, metadata,
        formula, min_sample_count, min_feature_count
    )

    # convert to dense representation
    dense_table = table.to_dataframe(dense=True).T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples,
        seed=random_seed,
    )

    model = MultRegression(learning_rate=learning_rate, clipnorm=clipnorm,
                           beta_mean=differential_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        tf.set_random_seed(random_seed)
        model(session, trainX, trainY, testX, testY)

        loss, cv, its = model.fit(
            epochs=epochs,
            summary_interval=summary_interval,
            checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    beta_ = np.hstack((np.zeros((model.p, 1)), model.B))
    beta_ = beta_ - beta_.mean(axis=1).reshape(-1, 1)

    differentials = pd.DataFrame(
        beta_.T, columns=md_ids, index=obs_ids,
    )
    differentials.index.name = 'featureid'

    convergence_stats = pd.DataFrame(
        {
            'loss': loss,
            'cross-validation': cv,
            'iteration': its
        }
    )

    convergence_stats.index.name = 'id'
    convergence_stats.index = convergence_stats.index.astype(str)

    c = convergence_stats['loss'].astype(float)
    convergence_stats['loss'] = c

    c = convergence_stats['cross-validation'].astype(float)
    convergence_stats['cross-validation'] = c

    c = convergence_stats['iteration'].astype(int)
    convergence_stats['iteration'] = c

    # regression biplot
    if differentials.shape[-1] > 1:
        u, s, v = np.linalg.svd(differentials)
        pc_ids = ['PC%d' % i for i in range(len(s))]
        samples = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                               columns=pc_ids, index=differentials.index)
        features = pd.DataFrame(v.T[:, :len(s)],
                                columns=pc_ids, index=differentials.columns)
        short_method_name = 'regression_biplot'
        long_method_name = 'Multinomial regression biplot'
        eigvals = pd.Series(s, index=pc_ids)
        proportion_explained = eigvals**2 / (eigvals**2).sum()
        biplot = OrdinationResults(
            short_method_name, long_method_name, eigvals,
            samples=samples, features=features,
            proportion_explained=proportion_explained)
    else:
        # this is to handle the edge case with only intercepts
        biplot = OrdinationResults('', '', pd.Series(), pd.DataFrame())

    return differentials, qiime2.Metadata(convergence_stats), biplot
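# Hedged toy sketch (not from the original source) of the regression-biplot
# construction above: SVD a small differentials-like matrix, project its rows
# with u @ diag(s), keep the right singular vectors as the other set of
# loadings, and report the variance explained per axis. Values are invented.
import numpy as np
import pandas as pd

diffs = pd.DataFrame([[0.0, 1.0, -1.0],
                      [0.5, -0.5, 0.0],
                      [2.0, 0.0, -2.0],
                      [-1.0, 1.0, 0.0]],
                     index=['f1', 'f2', 'f3', 'f4'],
                     columns=['Intercept', 'cov1', 'cov2'])
u, s, v = np.linalg.svd(diffs)
pc_ids = ['PC%d' % i for i in range(len(s))]
points = pd.DataFrame(u[:, :len(s)] @ np.diag(s),
                      columns=pc_ids, index=diffs.index)
arrows = pd.DataFrame(v.T[:, :len(s)], columns=pc_ids, index=diffs.columns)
eigvals = pd.Series(s, index=pc_ids)
proportion_explained = eigvals**2 / (eigvals**2).sum()
print(proportion_explained)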
Example #6
def multinomial(table: biom.Table,
                metadata: Metadata,
                formula: str,
                training_column: str = None,
                num_random_test_examples: int = 10,
                epoch: int = 10,
                batch_size: int = 5,
                beta_prior: float = 1,
                learning_rate: float = 0.1,
                clipnorm: float = 10,
                min_sample_count: int = 10,
                min_feature_count: int = 10,
                summary_interval: int = 60) -> (
                    pd.DataFrame, qiime2.Metadata
                ):

    # load metadata and tables
    metadata = metadata.to_dataframe()

    # match them
    table, metadata, design = match_and_filter(
        table, metadata,
        formula, training_column, num_random_test_examples,
        min_sample_count, min_feature_count
    )

    # convert to dense representation
    dense_table = table.to_dataframe(dense=True).T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples
    )

    model = MultRegression(learning_rate=learning_rate, clipnorm=clipnorm,
                           beta_mean=beta_prior,
                           batch_size=batch_size,
                           save_path=None)
    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)

        loss, cv, its = model.fit(
            epoch=epoch,
            summary_interval=summary_interval,
            checkpoint_interval=None)

    md_ids = np.array(design.columns)
    obs_ids = table.ids(axis='observation')

    beta_ = clr(clr_inv(np.hstack((np.zeros((model.p, 1)), model.B))))

    beta_ = pd.DataFrame(
        beta_.T, columns=md_ids, index=obs_ids,
    )
    convergence_stats = pd.DataFrame(
        {
            'loglikelihood': loss,
            'cross-validation': cv,
            'iteration': its
        }
    )

    convergence_stats.index.name = 'id'
    convergence_stats.index = convergence_stats.index.astype(str)

    c = convergence_stats['loglikelihood'].astype(float)
    convergence_stats['loglikelihood'] = c

    c = convergence_stats['cross-validation'].astype(float)
    convergence_stats['cross-validation'] = c

    c = convergence_stats['iteration'].astype(int)
    convergence_stats['iteration'] = c

    return beta_, qiime2.Metadata(convergence_stats)
Example #7
def percentile_normalize(table: biom.Table,
                         metadata: qiime2.MetadataColumn,
                         batch: qiime2.MetadataColumn = None,
                         n_control_thresh: int = 10,
                         otu_thresh: float = 0.3) -> biom.Table:
    """
    Converts an input table with cases and controls into percentiles
    of control samples.

    Parameters
    ----------
    table : biom.Table
        Feature table with relative abundances. Samples are in columns,
        features (i.e. OTUs) are in rows.
    metadata : qiime2.CategoricalMetadataColumn
        metadata column with samples labeled as "case" or "control".
        All samples with either label are returned, normalized to the
        equivalent percentile in "control" samples.
    batch : qiime2.CategoricalMetadataColumn
        metadata column with the different batches labeled. Percentile
        normalization will be performed within each batch, and the output
        tables will be concatenated together. You can use this to normalize
        multiple studies at once by first merging the original feature table,
        adding a study ID column in the merged metadata, and then calling
        percentile normalization with this option.
    n_control_thresh : int [default=10]
        Minimum number of controls accepted to perform percentile
        normalization. Because the transformation converts abundances
        in controls to a uniform distribution, we *highly* discourage
        performing percentile normalization on datasets with fewer than
        30 controls, and certainly not fewer than 10 (the default value).
        If you have fewer controls than `n_control_thresh`, the
        normalization will return an error.
    otu_thresh : float [default=0.3]
        The OTU filtering threshold: OTUs must be present in at least
        otu_thresh fraction of cases OR controls, otherwise they are thrown
        out and not percentile normalized. This method does not perform
        well with very sparse OTUs, so we do not recommend lowering
        this threshold below 0.3. otu_thresh should be in [0, 1].

    Returns
    -------
    norm_biom : biom.Table
        A biom table with the normalized data, only including the samples
        that were labeled as either "case" or "control", and the OTUs
        which passed the otu_thresh threshold.
    """
    # Filter metadata to only include IDs present in the table.
    # Also ensures every feature table sample ID is present in the metadata.
    metadata = metadata.filter_ids(table.ids(axis='sample'))
    metadata = metadata.drop_missing_values()

    # filter the table to exclude samples that were dropped from
    # the metadata due to missing values
    table = table.filter(metadata.ids)

    metadata = metadata.to_series()

    # Convert biom Table into a dense pandas DataFrame.
    # Transpose so samples are in rows and OTUs/features in columns.
    df = table.to_dataframe(dense=True).T

    # Set up a list of metadata series, one per batch
    batches_to_norm = []
    if batch is not None:
        batch = batch.filter_ids(table.ids(axis='sample'))
        batch = batch.drop_missing_values()
        batch = batch.to_series()
        for g, one_batch in batch.groupby(batch):
            batches_to_norm.append(metadata.loc[one_batch.index])
    else:
        batches_to_norm.append(metadata)

    norm_dfs = []
    for meta in batches_to_norm:
        # Get case and control samples from metadata
        control_samples = meta[meta == "control"].index.tolist()
        case_samples = meta[meta == "case"].index.tolist()

        # Make sure there are enough controls to perform normalization
        if len(control_samples) < n_control_thresh:
            if batch is not None:
                batch_err = (' in batch ' +
                             str(batch.loc[meta.index].unique()[0]))
            else:
                batch_err = ''
            raise ValueError(
                "There aren't enough controls in your data" + batch_err +
                " (n_control_thresh = {})".format(n_control_thresh))

        # Filter OTUs, replace zeros with random value, and
        # percentile normalize
        norm_df = _percentile_normalize_one_df(df, control_samples,
                                               case_samples, otu_thresh)
        norm_dfs.append(norm_df)

    # Merge all normalized data
    # Keep all samples and all OTUs - OTUs not present in one batch will be NaNs
    norm_df = pd.concat(norm_dfs, axis=1)

    # Put this dataframe into biom format
    norm_biom = biom.Table(data=norm_df.values,
                           observation_ids=norm_df.index,
                           sample_ids=norm_df.columns)

    return norm_biom
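# Hedged sketch (not the plugin's internal implementation) of the idea behind
# percentile normalization: a case sample's abundance for one feature is
# re-expressed as its percentile within the control distribution for that
# feature. The abundances below are invented.
import numpy as np
from scipy.stats import percentileofscore

control_abundances = np.array([0.01, 0.02, 0.05, 0.10, 0.20])
case_abundances = np.array([0.03, 0.15, 0.40])

case_percentiles = [percentileofscore(control_abundances, x, kind='mean')
                    for x in case_abundances]
print(case_percentiles)  # [40.0, 80.0, 100.0]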
Example #8
class TestFilters(unittest.TestCase):
    def setUp(self):
        X = np.array([[10, 1, 4, 1, 4, 0],
                      [0, 0, 2, 0, 2, 8],
                      [0, 1, 2, 1, 2, 4],
                      [0, 1, 0, 1, 0, 0],
                      [2, 0, 0, 0, 0, 0],
                      [1, 0, 0, 0, 0, 0],
                      [7, 1, 0, 1, 0, 0]])
        oids = ['o1', 'o2', 'o3', 'o4', 'o5', 'o6', 'o7']
        sids = ['s1', 's2', 's3', 's4', 's5', 's6']

        bigX = np.array([[10, 1, 4, 1, 4, 1, 0], [0, 0, 2, 0, 2, 1, 8],
                         [0, 1, 2, 1, 2, 1, 4], [0, 1, 0, 1, 0, 1, 0],
                         [2, 0, 0, 0, 0, 1, 0], [1, 0, 0, 0, 0, 1, 0],
                         [4, 0, 0, 0, 0, 1, 0]])

        self.big_table = Table(
            bigX,
            oids,
            sids + ['s9'],
        )

        self.metadata = pd.DataFrame(
            np.vstack((
                np.ones(8),
                np.array(['a', 'a', 'b', 'b', 'a', 'a', 'b', 'a']),
                np.arange(8).astype(np.float64),
                np.array(['Test', 'Test', 'Train', 'Train', 'Train', 'Train',
                          'Test', 'Train'])
            )).T,
            columns=['intercept', 'categorical', 'continuous', 'train'],
            index=['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8'])
        self.metadata['continuous'] = self.metadata['continuous'].astype(
            np.float64)
        self.trimmed_metadata = self.metadata.loc[[
            's1', 's2', 's3', 's4', 's5', 's6'
        ]]
        df = pd.DataFrame([{
            'intercept': 1,
            'categorical': 'b',
            'continuous': 1.,
            'train': 'Train'
        }, {
            'intercept': 1,
            'categorical': 'b',
            'continuous': 1.,
            'train': 'Train'
        }],
                          index=['s2', 's4'])
        df = df.reindex(
            columns=['intercept', 'categorical', 'continuous', 'train'])
        self.metadata_dup = pd.concat([self.metadata, df])
        self.table = Table(X, oids, sids)

    def test_match_duplicate(self):
        formula = 'C(categorical) + continuous'
        res = match_and_filter(self.table,
                               self.metadata_dup,
                               formula,
                               min_sample_count=0,
                               min_feature_count=0)
        res_table, res_metadata, res_design = res

        pdt.assert_frame_equal(res_table.to_dataframe(),
                               self.table.to_dataframe())

        exp_metadata = pd.DataFrame(
            np.vstack(
                (np.ones(6), np.array(['a', 'a', 'b', 'b', 'a',
                                       'a']), np.arange(6).astype(np.float64),
                 np.array(['Test', 'Test', 'Train', 'Train', 'Train',
                           'Train']))).T,
            columns=['intercept', 'categorical', 'continuous', 'train'],
            index=['s1', 's2', 's3', 's4', 's5', 's6'])
        exp_metadata['continuous'] = exp_metadata['continuous'].astype(
            np.float64)
        pdt.assert_frame_equal(res_metadata, exp_metadata)
        exp_design = pd.DataFrame(
            np.vstack((np.ones(6), np.array([0, 0, 1, 1, 0, 0]),
                       np.arange(6).astype(np.float64))).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6'])

        pdt.assert_frame_equal(res_design, exp_design)

    def test_match_and_filter_no_filter(self):
        formula = 'C(categorical) + continuous'
        res = match_and_filter(self.table,
                               self.metadata,
                               formula,
                               min_sample_count=0,
                               min_feature_count=0)
        res_table, res_metadata, res_design = res

        pdt.assert_frame_equal(res_table.to_dataframe(),
                               self.table.to_dataframe())

        exp_metadata = pd.DataFrame(
            np.vstack(
                (np.ones(6), np.array(['a', 'a', 'b', 'b', 'a',
                                       'a']), np.arange(6).astype(np.float64),
                 np.array(['Test', 'Test', 'Train', 'Train', 'Train',
                           'Train']))).T,
            columns=['intercept', 'categorical', 'continuous', 'train'],
            index=['s1', 's2', 's3', 's4', 's5', 's6'])
        exp_metadata['continuous'] = exp_metadata['continuous'].astype(
            np.float64)
        pdt.assert_frame_equal(res_metadata, exp_metadata)
        exp_design = pd.DataFrame(
            np.vstack((np.ones(6), np.array([0, 0, 1, 1, 0, 0]),
                       np.arange(6).astype(np.float64))).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6'])

        pdt.assert_frame_equal(res_design, exp_design)

    def test_match_and_filter_big_table(self):
        formula = 'C(categorical) + continuous'
        res = match_and_filter(self.big_table,
                               self.metadata,
                               formula,
                               min_sample_count=0,
                               min_feature_count=0)

        res_metadata = res[1]
        drop_metadata = res_metadata.dropna()
        res_design = res[2]
        drop_design = res_design.dropna()
        self.assertEqual(res_design.shape[0], drop_design.shape[0])
        self.assertEqual(res_metadata.shape[0], drop_metadata.shape[0])

    def test_split_training_random(self):
        np.random.seed(0)
        design = pd.DataFrame(
            np.vstack((np.ones(6), np.array([0, 0, 1, 1, 0,
                                             0]), np.arange(6))).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6'])
        res = split_training(self.table.to_dataframe().T,
                             self.trimmed_metadata,
                             design,
                             training_column=None,
                             num_random_test_examples=2)

        trainX, testX, trainY, testY = res
        # print(trainX.shape, testX.shape, trainY.shape, testY.shape)
        npt.assert_allclose(trainX.shape, np.array([4, 3]))
        npt.assert_allclose(trainY.shape, np.array([4, 7]))

        npt.assert_allclose(testX.shape, np.array([2, 3]))
        npt.assert_allclose(testY.shape, np.array([2, 7]))

    def test_split_training_fixed(self):
        np.random.seed(0)
        design = pd.DataFrame(
            np.vstack((np.ones(6), np.array([0, 0, 1, 1, 0,
                                             0]), np.arange(6))).T,
            columns=['Intercept', 'C(categorical)[T.b]', 'continuous'],
            index=['s1', 's2', 's3', 's4', 's5', 's6'])
        t = self.table.to_dataframe().T
        res = split_training(t,
                             self.metadata,
                             design,
                             training_column='train',
                             num_random_test_examples=2)

        exp_trainX = design.iloc[2:].values
        exp_testX = design.iloc[:2].values
        exp_trainY = t.iloc[2:].values
        exp_testY = t.iloc[:2].values

        res_trainX, res_testX, res_trainY, res_testY = res

        npt.assert_allclose(exp_trainX, res_trainX)
        npt.assert_allclose(exp_trainY, res_trainY)
        npt.assert_allclose(exp_testX, res_testX)
        npt.assert_allclose(exp_testY, res_testY)
Example #9
def rpca(
    table: biom.Table,
    n_components: Union[int, str] = DEFAULT_RANK,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    min_feature_frequency: float = DEFAULT_MFF,
    max_iterations: int = DEFAULT_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix, pd.DataFrame):
    """Runs RPCA with an rclr preprocessing step.

       This code will be run by both the standalone and QIIME 2 versions of
       DEICODE.
    """
    # get shape of table
    n_features, n_samples = table.shape

    # filter sample to min seq. depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # filter features to min total counts
    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter features by N samples presence
    def frequency_filter(val, id_, md):
        return (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)

    # filter and import table for each filter above
    table = table.filter(observation_filter, axis='observation')
    table = table.filter(frequency_filter, axis='observation')
    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T
    # check the table after filtering
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')
    # Robust-clr (rclr) preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(rclr(table))
    # get new n-comp when applicable
    n_components = opt.s.shape[0]
    # get PC column labels for the skbio OrdinationResults
    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
    # get completed matrix for centering
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    # center again around zero after completion
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    # re-factor the data
    u, s, v = svd(X)
    # only take n-components
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    # calc. the new variance using projection
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]
    # save the loadings
    robust_clr = pd.DataFrame(X, index=table.index, columns=table.columns)
    feature_loading = pd.DataFrame(v, index=table.columns, columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index, columns=rename_cols)
    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if the n_components is two, add a PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in DEICODE -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res, robust_clr
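# Small worked check (illustrative only) of the frequency_filter closure above:
# a feature observed in 2 of 5 samples has 40% presence, so it survives a
# min_feature_frequency of 25 but not one of 50.
import numpy as np

val = np.array([0, 3, 0, 7, 0])  # counts of one feature across 5 samples
presence = np.sum(val > 0) / val.size            # 0.4
print(presence > 25 / 100, presence > 50 / 100)  # True False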
Example #10
def qarcoal(
    table: biom.Table,
    taxonomy: pd.DataFrame,
    num_string: str,
    denom_string: str,
    samples_to_use: Metadata = None,
    allow_shared_features: bool = False,
) -> pd.DataFrame:
    """Calculate sample-wise log-ratios of features based on taxonomy.

    Parameters
    ----------
        table: biom file with which to calculate log ratios
        taxonomy: pd.DataFrame with taxonomy information (should have Taxon
            column in which features will be searched)
        num_string: numerator string to search for in taxonomy
        denom_string: denominator string to search for in taxonomy
        samples_to_use: Q2 Metadata file with samples to use.
            If provided, feature table will be filtered to only consider
            samples present in this file. (optional)
        allow_shared_features: bool denoting handling of shared features
            between numerator and denominator. If False, an error is raised
            if features are shared between numerator and denominator. If True,
            will allow shared features without throwing an error.
    Returns
    -------
        comparison_df: pd DataFrame in the form:

            Sample-ID    Num_Sum    Denom_Sum   log_ratio
                   S1          7           15   -0.762140
    """

    # biom table is features x samples
    if samples_to_use is not None:
        filt_samples = set(samples_to_use.to_dataframe().index)
        feat_table = table.filter(filt_samples, axis="sample", inplace=False)
        feat_table = feat_table.to_dataframe()
    else:
        feat_table = table.to_dataframe()

    # raise error if there are any negative counts in the feature table
    if feat_table.lt(0).any().any():
        raise ValueError("Feature table has negative counts!")

    tax_num_df, tax_denom_df = filter_and_join_taxonomy(
        feat_table,
        taxonomy,
        num_string,
        denom_string,
    )

    # if shared features are disallowed, check to make sure they don't occur
    # if allowed, can skip this step at user's risk
    if not allow_shared_features:
        shared_features = set(tax_num_df.index) & set(tax_denom_df.index)
        if shared_features:
            raise ValueError("Shared features between num and denom!")

    tax_num_sample_sum = tax_num_df.sum(axis=0)
    tax_denom_sample_sum = tax_denom_df.sum(axis=0)

    comparison_df = pd.DataFrame.from_records(
        [tax_num_sample_sum, tax_denom_sample_sum],
        index=["Num_Sum", "Denom_Sum"],
    ).T
    comparison_df["log_ratio"] = comparison_df.apply(
        lambda x: np.log(x.Num_Sum / x.Denom_Sum), axis=1
    )
    comparison_df.index.name = "Sample-ID"

    return comparison_df
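# Quick numeric check (illustration only) of the log-ratio shown in the
# docstring's example row: a sample with Num_Sum = 7 and Denom_Sum = 15 gets
# log(7 / 15), which is approximately -0.762140, matching the table above.
import numpy as np

print(np.log(7 / 15))  # ~ -0.762140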