Example no. 1
    def test_multiplicative_replacement(self):
        amat = multiplicative_replacement(closure(self.cdata3))
        npt.assert_allclose(
            amat,
            np.array([[0.087273, 0.174545, 0.261818, 0.04, 0.436364],
                      [0.092, 0.04, 0.04, 0.368, 0.46],
                      [0.066667, 0.133333, 0.2, 0.266667, 0.333333]]),
            rtol=1e-5,
            atol=1e-5)

        amat = multiplicative_replacement(closure(self.cdata4))
        npt.assert_allclose(
            amat,
            np.array([0.087273, 0.174545, 0.261818, 0.04, 0.436364]),
            rtol=1e-5,
            atol=1e-5)

        amat = multiplicative_replacement(closure(self.cdata6))
        npt.assert_allclose(
            amat,
            np.array([[0.087273, 0.174545, 0.261818, 0.04, 0.436364],
                      [0.092, 0.04, 0.04, 0.368, 0.46],
                      [0.066667, 0.133333, 0.2, 0.266667, 0.333333]]),
            rtol=1e-5,
            atol=1e-5)

        with self.assertRaises(ValueError):
            multiplicative_replacement(self.bad1)
        with self.assertRaises(ValueError):
            multiplicative_replacement(self.bad2)

        # make sure that inplace modification is not occurring
        multiplicative_replacement(self.cdata4)
        npt.assert_allclose(self.cdata4, np.array([1, 2, 3, 0, 5]))
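A minimal sketch of what these assertions exercise (values taken from the test above; for five components skbio's default delta is (1/5)**2 = 0.04):

import numpy as np
from skbio.stats.composition import closure, multiplicative_replacement

x = np.array([1, 2, 3, 0, 5])
amat = multiplicative_replacement(closure(x))
print(amat)        # ~[0.087273, 0.174545, 0.261818, 0.04, 0.436364]
print(amat.sum())  # ~1.0: the zero became delta and the rest were rescaled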
Example no. 2
    def test_multiplicative_replacement(self):
        amat = multiplicative_replacement(closure(self.data3))
        npt.assert_allclose(amat,
                            np.array([[0.087273, 0.174545, 0.261818,
                                       0.04, 0.436364],
                                      [0.092, 0.04, 0.04, 0.368, 0.46],
                                      [0.066667, 0.133333, 0.2,
                                       0.266667, 0.333333]]),
                            rtol=1e-5, atol=1e-5)

        amat = multiplicative_replacement(closure(self.data4))
        npt.assert_allclose(amat,
                            np.array([0.087273, 0.174545, 0.261818,
                                      0.04, 0.436364]),
                            rtol=1e-5, atol=1e-5)

        amat = multiplicative_replacement(closure(self.data6))
        npt.assert_allclose(amat,
                            np.array([[0.087273, 0.174545, 0.261818,
                                       0.04, 0.436364],
                                      [0.092, 0.04, 0.04, 0.368, 0.46],
                                      [0.066667, 0.133333, 0.2,
                                       0.266667, 0.333333]]),
                            rtol=1e-5, atol=1e-5)

        with self.assertRaises(ValueError):
            multiplicative_replacement(self.bad1)
        with self.assertRaises(ValueError):
            multiplicative_replacement(self.bad2)

        # make sure that inplace modification is not occurring
        multiplicative_replacement(self.data4)
        npt.assert_allclose(self.data4, np.array([1, 2, 3, 0, 5]))
Example no. 3
def mult_replace(df):
    """
    wrapper for skbio's multiplicative multiplicative_replacement

    Parameters
    ----------
    df : DataFrame

    Returns
    -------
    df_mr : DataFrame
        modified via multiplicative replacement

    Notes
    -----
    Replaces zeros with the minimum non zero value in the entire
    matrix. Use multiplicaive replacement to ensure rows
    sum close to 1.

    """
    assert (isinstance(df, pd.DataFrame))
    nzra = np.min(df.values.flatten()[df.values.flatten() > 0])
    half_nzra = nzra / 2
    # multiplicative replacement adds small value to non-zero entries while maintaining row sums equal to 1
    df_mr = pd.DataFrame(multiplicative_replacement(df, delta=half_nzra))
    assert (np.all(df_mr.values > 0))
    return df_mr
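Hypothetical usage of the wrapper above, assuming a small relative-abundance DataFrame whose rows already sum to 1 (the smallest non-zero entry, 0.2, sets delta = 0.1):

import pandas as pd

df = pd.DataFrame([[0.5, 0.5, 0.0],
                   [0.2, 0.3, 0.5]])
df_mr = mult_replace(df)
print(df_mr.sum(axis=1))  # rows still sum to ~1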
Example no. 4
    def normalize_transform(self, mode='clr'):
        """
        Some operations may require transformed data.
        This function performs normalization and
        a clr transform on all OTU tables in a Batch object.
        It returns a deep copy of the original Batch object,
        so the original file is not modified.

        :param mode: transformation mode; clr (centered log-ratio) or ilr (isometric log-ratio)
        :return: Transformed copy of Batch object.
        """
        batchcopy = copy.deepcopy(self)
        try:
            for x in list(self.otu):
                # normalizes the data by samples
                normbiom = batchcopy.otu[x].norm(axis='sample', inplace=False)
                mat = csr_matrix.toarray(normbiom.matrix_data)
                # replaces all zeros with a small value
                # multiplicative replacement preserves ratios between values
                mat = multiplicative_replacement(mat)
                if mode == 'clr':
                    mat = clr(mat)
                elif mode == 'ilr':
                    mat = ilr(mat)
                else:
                    raise ValueError("Only CLR and ILR transformations are currently supported.")
                normbiom._data = csc_matrix(mat)
                batchcopy.otu[x] = normbiom
        except Exception:
            logger.error("Failed to normalize data", exc_info=True)
        return batchcopy
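Setting the Batch/biom plumbing aside, the core chain in normalize_transform can be sketched standalone (made-up counts; rows are samples):

import numpy as np
from skbio.stats.composition import multiplicative_replacement, clr

counts = np.array([[10., 0., 5.],
                   [3., 7., 0.]])
props = counts / counts.sum(axis=1, keepdims=True)  # normalize by sample
mat = clr(multiplicative_replacement(props))        # replace zeros, then clr
print(mat.sum(axis=1))                              # clr rows sum to ~0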
Example no. 5
def globalCLRPermTest(otuDf,
                      labels,
                      statfunc=_sumRhoStat,
                      nperms=999,
                      seed=110820,
                      binary=False):
    """Calculates centered-log-ratios (CLR) for each sample and performs global
    permutation tests to determine if there is a significant correlation
    over all log-median-ratios, with respect to the label variable of interest.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels : pd.Series (float)
        Contains binary variable indicating membership into one of two categories
        (e.g. treatment conditions). Must share index with otuDf.
    statfunc : function
        Takes a np.ndarray [n x k] and float index [n] as parameters and
        returns a float summarizing over k.
    nperms : int
        Number of iterations for the permutation test.
    seed : int
        Seed for random permutation generation.

    Returns
    -------
    pvalue : float
        Global p-value for a significant association of OTU log-median-ratios
        with label, based on the summary statistic.
    obs : float
        Statistic summarizing the label difference."""

    nSamples, nOTUs = otuDf.shape

    if binary:
        labelValues = labels.values.astype(bool)
    else:
        labelValues = labels.values.astype(float)

    # Make proportions
    otuDf = otuDf / otuDf.sum()
    # Apply multiplicative replacement for zero values
    otuMR = multiplicative_replacement(otuDf.values)
    # Calculate the CLR
    otuCLR = clr(otuMR)
    # Make into a DataFrame
    otuCLR = pd.DataFrame(otuCLR, index=otuDf.index, columns=otuDf.columns)

    np.random.seed(seed)
    obs = statfunc(otuCLR.values, labelValues)
    samples = np.array([
        statfunc(otuCLR.values, labelValues[np.random.permutation(nSamples)])
        for permi in range(nperms)
    ])
    """Since test is based on the abs statistic it is inherently two-sided"""
    pvalue = ((np.abs(samples) >= np.abs(obs)).sum() + 1) / (nperms + 1)

    return pvalue, obs
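_sumRhoStat is not defined in this snippet; a hypothetical statfunc with the documented signature (np.ndarray [n x k] plus labels [n], returning a float summarizing over k) might look like:

import numpy as np
from scipy import stats

def sum_rho_stat(mat, labels):
    # sum of absolute Spearman correlations between each OTU column and the label
    return np.sum([abs(stats.spearmanr(mat[:, k], labels).correlation)
                   for k in range(mat.shape[1])])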
Example no. 6
def normalize(df: pd.DataFrame,
              method: str = "am_clr",
              out: str = None,
              force: bool = False) -> pd.DataFrame:
    """Normalize raw k-mer counts by center or isometric log-ratio transform.

    Parameters
    ----------
    df : pd.DataFrame
        k-mer counts dataframe.
        i.e. for 3-mers; Index='contig', columns=[AAA, AAT, ...]
    method : str, optional
        Normalize k-mer counts using CLR or ILR transformation
        (the default is Autometa's CLR implementation).
        choices = ['ilr', 'clr', 'am_clr']
        Other transformations come from the skbio.stats.composition module
    out : str, optional
        Path to write normalized k-mers.
    force : bool, optional
        Whether to overwrite existing `out` file path, by default False.

    Returns
    -------
    pd.DataFrame
        Normalized counts using provided `method`.

    Raises
    ------
    ValueError
        Provided `method` is not available.
    """
    method = method.lower()
    out_specified = out is not None
    out_exists = os.path.exists(out) if out else False
    case1 = out_specified and out_exists and not force
    if case1:
        logger.debug(
            f"{out} already exists. Use force to overwrite. retrieving...")
        return pd.read_csv(out, sep="\t", index_col="contig")
    logger.debug(f"Transforming k-mer counts using {method}")
    choices = {"ilr", "clr", "am_clr"}
    if method == "am_clr":
        norm_df = autometa_clr(df)
    elif method in choices:
        transforms = {"ilr": ilr, "clr": clr}
        X = df.fillna(0).to_numpy()
        X = multiplicative_replacement(X)
        X_norm = transforms[method](X)
        norm_df = pd.DataFrame(X_norm, index=df.index)
    else:
        choices = ", ".join(choices)
        raise ValueError(
            f"Normalization method not available: {method}. Choices: {choices}")
    case2 = out_specified and out_exists and force
    case3 = out_specified and not out_exists
    if case2 or case3:
        norm_df.to_csv(out, sep="\t", index=True, header=True)
    return norm_df
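A hypothetical call, assuming the module-level logger and the autometa_clr helper referenced above are in scope (tiny made-up 3-mer table):

import pandas as pd

counts = pd.DataFrame([[4, 1, 0], [2, 2, 6]],
                      index=["contig_1", "contig_2"],
                      columns=["AAA", "AAT", "AAC"])
norm_df = normalize(counts, method="clr")  # fill NaNs, replace zeros, then clr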
Example no. 7
    def clr_transform_cags_via_mult_rep_method(self):
        """
        NOT GENERALIZABLE - DELETE
        uses multiplicative replacement to replace zeros with half of
        the lowest non-zero relative abundance value. Then performs clr
        transformation.

        Assigns
        -------
        self.cags_dict : dictionary
            dictionary keyed on 'cags' with the following attributes:
            1. cags_wide_df - relative abundances
            2. cags_wide_mr_clr_df - clr transformed
               abundances (uses multiplicative replacement)
            3. half_nzra - half of the lowest non-zero relative abundance
               (NZRA) used for the multiplicative replacement step
        """

        cag_wide = self._pivot_cags()
        # one solution is to use the lowest non-zero relative abundance (NZRA), or more typically NZRA/2
        nzra = np.min(cag_wide.values.flatten()[cag_wide.values.flatten() > 0])
        half_nzra = nzra / 2
        # multiplicative replacement replaces zeros with a small value
        # (rescaling the non-zero entries) so that row sums stay equal to 1
        cag_wide_mr = multiplicative_replacement(cag_wide, delta=half_nzra)
        # clr transform
        cag_wide_mr_clr = clr(cag_wide_mr)
        # clr transform array to data.frame with index and column matching mp_wide_taxa
        cag_wide_mr_clr_df = pd.DataFrame(cag_wide_mr_clr)
        cag_wide_mr_clr_df.columns = cag_wide.columns
        cag_wide_mr_clr_df.index = cag_wide.index

        self.cags_dict["cags"] = {
            "cags_wide_df": cag_wide,
            "cags_wide_mr_clr_df": cag_wide_mr_clr_df,
            "half_nzra": half_nzra
        }
        return cag_wide_mr_clr_df

    def fetch_metaphlan_result(self, clr=True, taxonomic_level="phylum"):
        """
        getter
        """
        if clr:
            key = 'mp_wide_taxa_mr_clr_df'
        else:
            key = 'mp_wide_taxa_df'
        try:
            return self.metaphlan_dict[taxonomic_level][key]
        except KeyError:
            print(
                "NO METAPHLAN MATRIX CREATED SEE clr_transform_metaphlan_via_mult_rep_method()"
            )
Example no. 8
def globalCLRPermTest(otuDf, labels, statfunc=_sumRhoStat, nperms=999, seed=110820, binary=False):
    """Calculates centered-log-ratios (CLR) for each sample and performs global
    permutation tests to determine if there is a significant correlation
    over all log-median-ratios, with respect to the label variable of interest.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels : pd.Series (float)
        Contains binary variable indicating membership into one of two categories
        (e.g. treatment conditions). Must share index with otuDf.
    statfunc : function
        Takes a np.ndarray [n x k] and float index [n] as parameters and
        returns a float summarizing over k.
    nperms : int
        Number of iterations for the permutation test.
    seed : int
        Seed for random permutation generation.

    Returns
    -------
    pvalue : float
        Global p-value for a significant association of OTU log-median-ratios
        with label, based on the summary statistic.
    obs : float
        Statistic summarizing the label difference."""

    nSamples, nOTUs = otuDf.shape

    if binary:
        labelValues = labels.values.astype(bool)
    else:
        labelValues = labels.values.astype(float)

    # Make proportions
    otuDf = otuDf / otuDf.sum()
    # Apply multiplicative replacement for zero values
    otuMR = multiplicative_replacement(otuDf.values)
    # Calculate the CLR
    otuCLR = clr(otuMR)
    # Make into a DataFrame
    otuCLR = pd.DataFrame(otuCLR, index=otuDf.index, columns=otuDf.columns)

    np.random.seed(seed)
    obs = statfunc(otuCLR.values, labelValues)
    samples = np.array([
        statfunc(otuCLR.values, labelValues[np.random.permutation(nSamples)])
        for permi in range(nperms)
    ])
    
    """Since test is based on the abs statistic it is inherently two-sided"""
    pvalue = ((np.abs(samples) >= np.abs(obs)).sum() + 1) / (nperms + 1)

    return pvalue, obs
Example no. 9
def normalize_clr(data):

    """Replace zeros and apply the clr transform."""

    assert data.shape[0] < data.shape[1], \
        "samples should be along the rows (expected fewer samples than features)"

    normalized = composition.clr(composition.multiplicative_replacement(data))
    normalized = pd.DataFrame(normalized,
                              index=data.index, columns=data.columns)

    return normalized
Example no. 10
    def _clr_transform_via_mult_rep_method(self, df):
        nzra = np.min(df.values.flatten()[df.values.flatten() > 0])
        half_nzra = nzra / 2
        # multiplicative replacement replaces zeros with a small value
        # (rescaling the non-zero entries) so that row sums stay equal to 1
        df_mr = multiplicative_replacement(df, delta=half_nzra)
        # clr transform
        mr_clr = clr(df_mr)
        # back to a DataFrame with index and columns matching the input df
        mr_clr_df = pd.DataFrame(mr_clr)
        mr_clr_df.columns = df.columns
        mr_clr_df.index = df.index
        return mr_clr_df
Example no. 11
    def mult_replace(self, df):
        """
        Replace zeros with half of the minimum non-zero value in the
        entire matrix. Uses multiplicative replacement to keep rows
        summing close to 1.
        """
        nzra = np.min(df.values.flatten()[df.values.flatten() > 0])
        half_nzra = nzra / 2
        # multiplicative replacement replaces zeros with a small value
        # (rescaling the non-zero entries) so that row sums stay equal to 1
        df_mr = pd.DataFrame(multiplicative_replacement(df, delta=half_nzra))

        return df_mr
Example no. 12
def aitchison_transform_part(df):
    """
    Aitchison tranformation on df with all columns belonging to same batch.
    
    df should consist of all samples tagged together in one channel (i.e. A549_S_rep1 etc.)
    """
    df_aitchison = multiplicative_replacement(df)
    #df_aitchison = closure(df)
    df_idx = df.index
    df_col = df.columns
    df_aitchison = pd.DataFrame(df_aitchison, index=df_idx, columns=df_col)
    return df_aitchison
Example no. 13
def preprocess_df(df, rep, state):
    """
    Aitchison-transformed subset of the data.
    """
    df_subset = df[select_rep_state_intensities(rep, state)]
    cols = df_subset.columns
    df_subset = drop_zero_rows(
        df_subset)  #index should be the same as protein/peptides
    index = df_subset.index
    df_subset = multiplicative_replacement(df_subset)
    df_subset = clr(df_subset)
    df_subset = pd.DataFrame(df_subset, index=index, columns=cols)
    return df_subset
Example no. 14
File: coda.py  Project: SilasK/CMGM
def clr(counts_data, log=np.log2):

    #TODO: check if count data

    # remove columns where all values are <= 1
    data = counts_data.loc[:, ~(counts_data <= 1).all()]

    # DataFrame with zeros replaced
    data = pd.DataFrame(composition.multiplicative_replacement(data),
                        columns=data.columns,
                        index=data.index)

    data = log(data)
    data = (data.T - data.mean(1)).T

    return data
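A sanity sketch with made-up counts: with log=np.log the hand-rolled centering above agrees with skbio's clr (np.log2 merely rescales everything by a constant factor):

import numpy as np
import pandas as pd
from skbio.stats import composition

df = pd.DataFrame([[4, 2, 8], [2, 2, 2]], columns=list("abc"))
ours = clr(df, log=np.log)  # the clr defined above
ref = composition.clr(composition.multiplicative_replacement(df))
assert np.allclose(ours.values, ref)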
Example no. 15
    def clr_transform_metaphlan_via_mult_rep_method(self,
                                                    taxonomic_level="phylum"):
        """
        NOT GENERALIZABLE - DELETE

        uses multiplicative replacement to replace zeros with half of
        the lowest non-zero relative abundance value. Then performs clr
        transformation.

        Arguments
        ---------
        taxonomic_level : string
            "phlyum" through "species"

        Assigns
        -------
        self.metaphlan_dict : dictionary
            dictionary keyed on taxa level with the following attributes:
                1. mp_wide_taxa_df - taxa level relative abundances
                2. mp_wide_taxa_mr_clr_df - taxa level clr transformed
                   abundances (uses multiplicative replacement)
                3. half_nzra - half of the lowest non-zero relative abundance
                   (NZRA) used for the multiplicative replacement step

        """
        mp_wide_taxa = self._pivot_metaphlan(taxonomic_level=taxonomic_level)
        # one solution is to use the lowest non-zero relative abundance (NZRA), or more typically NZRA/2
        nzra = np.min(
            mp_wide_taxa.values.flatten()[mp_wide_taxa.values.flatten() > 0])
        half_nzra = nzra / 2
        # multiplicative replacement replaces zeros with a small value
        # (rescaling the non-zero entries) so that row sums stay equal to 1
        mp_wide_taxa_mr = multiplicative_replacement(mp_wide_taxa,
                                                     delta=half_nzra)
        # clr transform
        mp_wide_taxa_mr_clr = clr(mp_wide_taxa_mr)
        # clr transform array to data.frame with index and column matching mp_wide_taxa
        mp_wide_taxa_mr_clr_df = pd.DataFrame(mp_wide_taxa_mr_clr)
        mp_wide_taxa_mr_clr_df.columns = mp_wide_taxa.columns
        mp_wide_taxa_mr_clr_df.index = mp_wide_taxa.index

        self.metaphlan_dict[taxonomic_level] = {
            "mp_wide_taxa_df": mp_wide_taxa,
            "mp_wide_taxa_mr_clr_df": mp_wide_taxa_mr_clr_df,
            "half_nzra": half_nzra
        }
        return mp_wide_taxa_mr_clr_df
Example no. 16
    def aitchison_distance(self, rank=Rank.Auto):
        """Calculate the Aitchison distance between samples.

        Aitchison distance is the Euclidean distance between centered log-ratio (clr) transformed samples (abundances).
        As this requires log transforms, we first need to 'estimate' zeros in the data,
        i.e. replace zeros with small, positive values, while keeping each sample's sum at 1.

        Parameters
        ----------
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.
        """
        import numpy as np
        from skbio.stats.composition import multiplicative_replacement, clr
        from sklearn.metrics.pairwise import euclidean_distances
        from skbio.stats.distance import DistanceMatrix

        df = self.to_df(rank=rank, normalize=self._guess_normalized()
                        )  # get a dataframe of abundances
        df_n0 = multiplicative_replacement(
            df)  # replace 0s with positive small numbers
        df_n0_clr = clr(df_n0)  # clr-normalize
        aitchison_array = euclidean_distances(
            df_n0_clr, df_n0_clr)  # get the euclidean distances

        # Due to rounding differences, we must force mirroring on the matrix
        aitchison_dm = np.zeros(aitchison_array.shape)
        aitchison_dm[np.triu_indices(aitchison_array.shape[0],
                                     k=0)] = aitchison_array[np.triu_indices(
                                         aitchison_array.shape[0], k=0)]
        aitchison_dm = aitchison_dm + aitchison_dm.T - np.diag(
            np.diag(aitchison_dm))
        aitchison_dm = DistanceMatrix(aitchison_dm, df.index)

        return aitchison_dm
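The same distance without the class machinery (a minimal standalone sketch; rows are samples):

import numpy as np
from skbio.stats.composition import multiplicative_replacement, clr
from sklearn.metrics.pairwise import euclidean_distances

abund = np.array([[0.7, 0.3, 0.0],
                  [0.1, 0.4, 0.5]])
d = euclidean_distances(clr(multiplicative_replacement(abund)))
print(d)  # symmetric 2x2 matrix of Aitchison distances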
Example no. 17
    def cluster_heatmap(self, working_samples, samples_list, tax_level):
        """ saves a cluster heatmap based on Aitchison distance and the y-axis labels"""
        from skbio.stats.composition import clr
        from skbio.stats.composition import multiplicative_replacement
        import seaborn as sns

        if self.abundance_df.groupAbsoluteSamples() is not None:
            data0 = self.abundance_df.groupAbsoluteSamples(
            )[samples_list].astype('int')
            ids = list(data0.columns)
            index0 = list(data0.index)
            mr_df = multiplicative_replacement(data0.T)
            mr_clr = clr(mr_df)
            mr_clr_df = pd.DataFrame(mr_clr.T, index=index0, columns=ids)

            #g = sns.clustermap(mr_clr_df, metric="correlation", cmap="mako", robust=True, annot_kws={"size": 6})
            g = sns.clustermap(mr_clr_df,
                               metric="euclidean",
                               cmap="mako",
                               robust=True,
                               annot_kws={"size": 6},
                               yticklabels=False)

            filename = self.save_high_resolution_figure(
                g,
                'Select file to save the cluster heatmap',
                'cluster_heatmap',
                defaultextension='.png')
            filename = ('.').join(filename.split('.')[:-1])
            #save y-axis labels
            y_labels = list(data0.iloc[g.dendrogram_row.reordered_ind].index)
            with open(filename + '_yaxis_labels.txt', 'w') as f:
                f.write('\n'.join([x.strip('_') for x in y_labels]))

            import matplotlib.pyplot as plt
            plt.close("all")
Example no. 18
def clr(counts_data, log=np.log2):
    """Convert counts data to centered log-ratio (CLR) with log2.
    Zeros are replaced by multiplicative_replacement from scikit-bio.
    See Wikipedia for centered log ratio.
    """

    from skbio.stats import composition

    #TODO: check if count data
    
    data = counts_data.astype(int)

    # remove columns where all values are <= 1
    data = data.loc[:, ~(data <= 1).all()]

    # DataFrame with zeros replaced
    data = pd.DataFrame(composition.multiplicative_replacement(data),
                        columns=data.columns,
                        index=data.index)

    data = log(data)
    data = (data.T - data.mean(1)).T

    return data
Example no. 19
    def multiplicative_replacement_warning(self):
        with self.assertRaises(ValueError):
            multiplicative_replacement([0, 1, 2], delta=1)
Example no. 20
def loadAbundance(filename, compositionNorm=True, truncate=True):
    """Load OTU counts file (phylum, genus or species level)
    with OTUs along the rows and samples along the columns.

    Parameters
    ----------
    filename : str
        Excel file from QIIME pipeline.
        Contains OTUs along the rows and samples along the columns,
        with a few header rows.
    compositionNorm : bool
        Add delta count to zeros and normalize each sample by the
        total number of reads. (uses skbio.stats.composition.multiplicative_replacement)
    truncate : bool
        Discard taxa with less than 0.5% of total reads.
        Discard taxa that are not present in 25% of samples.
        """
    def _cleanCountDf(df):
        """Drop extra columns/headers and transpose so that
        samples are along rows and OTUs along columns.

        Returns
        -------
        outDf : pd.DataFrame [index: samples, columns: OTUs]"""

        df = df.drop(['tax_id', 'rank'], axis=1)
        df = df.dropna(subset=['tax_name'], axis=0)
        df = df.rename(columns={'tax_name': 'OTU'})
        df = df.set_index('OTU')
        df = df.drop(['specimen'], axis=0)
        df = df.T
        df = df.dropna(subset=['label'], axis=0)
        df['sid'] = df.label.str.replace('Sample-', 'S')
        df = df.set_index('sid')
        df = df.drop('label', axis=1)
        df = df.astype(float)
        return df

    def _discardLow(df, thresh=0.005):
        """Discard taxa/columns with less than 0.5% of reads"""
        totReads = df.values.sum()
        keepInd1 = (df.sum(axis=0)/totReads) > thresh
        
        """Also discard taxa that are not present in 25% of samples"""
        keepInd2 = (df>0).sum(axis=0)/df.shape[0] > 0.25
        
        return df.loc[:, keepInd1 & keepInd2]
    
    df = pd.read_excel(filename)
    df = _cleanCountDf(df)
        
    if truncate:
        df = _discardLow(df)

    if compositionNorm:
        values = composition.multiplicative_replacement(df.values)
        df = pd.DataFrame(values, columns=df.columns, index=df.index)

    cols = [c for c in df.columns if c not in ['sid']]
    
    print('Abundance data: %s samples, %s taxa' % (df.shape[0], len(cols)))
    return df, cols
Example no. 21
def clr_on_subset(df_subset):
    df_subset = drop_zero_rows(df_subset)
    df_subset = multiplicative_replacement(df_subset)
    df_subset = clr(df_subset)
    return df_subset
Example no. 22
#data_corrected = pycombat(df_norm_prot,batch)
data_corrected = pycombat(df_norm.fillna(0), batch[0])

##################### THINK ABOUT THIS... maybe aitchison before ComBat?
########################################
# Aitchison multiplicative_replacement #
########################################
data_corrected.sum()
df_int.sum()
df_aitchison = multiplicative_replacement(df_int)
df_aitchison = pd.DataFrame(df_aitchison, columns=midx)


def aitchison_transform(df):
    """
    Aitchison tranformation on df.
    
    df should consist of all samples tagged together in one channel (i.e. A549_S_rep1 etc.)
    """
    df_aitchison = multiplicative_replacement(df)
    #df_aitchison = closure(df)
    df_idx = df.index
    df_col = df.columns
    df_aitchison = pd.DataFrame(df_aitchison, index=df_idx, columns=df_col)
    return df_aitchison
Example no. 23
    labs = f.read().split("\t")
# Remove new-line characters that have numbers after them
regex = re.compile(r'\n.*')
labels = [re.sub(regex, "", e) for e in labs]
# Remove first element "x"
labels.pop(0)

# Ensure that this is not the rarefied ASV table
sample_counts = unscaled_tab.sum(axis=1)

# Perform total sum scaling normalization (TSS)
scaled = unscaled_tab.div(unscaled_tab.sum(axis=1), axis=0)
# scaled.sum(axis=1) # check

# Substitute zeros with small pseudocounts, since log-ratio transforms are undefined at zero
zeros_scaled = comp.multiplicative_replacement(scaled)  # numpy.ndarray

# Isometric log-ratio (ilr) transform, to map the compositions into unconstrained Euclidean space
ilr_transformed = comp.ilr(zeros_scaled)

# Convert ndarray back to a DataFrame; ilr returns D-1 coordinates,
# so the original taxon column labels no longer apply
df_ilr_transformed = pd.DataFrame(ilr_transformed,
                                  index=scaled.index)
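Since the DataFrame step above relies on ilr dropping one dimension, a quick standalone check with made-up counts (D parts become D-1 coordinates):

import numpy as np
from skbio.stats import composition as comp

x = comp.multiplicative_replacement(np.array([[1, 2, 3, 0],
                                              [4, 0, 1, 5]]))
print(comp.ilr(x).shape)  # -> (2, 3): four parts, three ilr coordinates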

########################################################################################################
# Decision tree methods tended to perform well
# HFE OTU feature reduction method brought a substantial performance improvement for nearly all methods
# After feature reduction most methods performed similarly, so feature reduction is the step worth doing
########################################################################################################
Example no. 24
microbe_iv['group'] = microbe_iv['group'].map(catdict)
metabolite_iv['group'] = metabolite_iv['group'].map(catdict)

# highlight features with p-value <= 0.001
max_pval = 0.001

microbe_iv.loc[microbe_iv.pval > max_pval, 'group'] = 'None'
print('Number of significant microbes: %d' %
      microbe_iv[microbe_iv['group'] != 'None'].shape[0])

metabolite_iv.loc[metabolite_iv.pval > max_pval, 'group'] = 'None'
print('Number of significant metabolites: %d' %
      metabolite_iv[metabolite_iv['group'] != 'None'].shape[0])

plssvd = PLSSVD(n_components=3)
plssvd.fit(X=clr(centralize(multiplicative_replacement(microbes))),
           Y=clr(centralize(multiplicative_replacement(metabolites))))


def standardize(A):
    A = (A - np.mean(A, axis=0)) / np.std(A, axis=0)
    return A


pls_microbes = pd.DataFrame(standardize(plssvd.x_weights_),
                            columns=['PCA1', 'PCA2', 'PCA3'],
                            index=microbes.columns)
pls_metabolites = pd.DataFrame(standardize(plssvd.y_weights_),
                               columns=['PCA1', 'PCA2', 'PCA3'],
                               index=metabolites.columns)
Example no. 25
shortest = dijkstra(dm.values)
shortest = pd.DataFrame(shortest, columns=dm.index, index=dm.columns)
shortest = shortest.reindex(sorted(otu_table.columns), axis=0)
shortest = shortest.reindex(sorted(otu_table.columns), axis=1)
# otu_table = table.T
otu_table = otu_table.reindex(sorted(otu_table.columns), axis=1)

# Uses an idea similar to simrank
graph_dm = (otu_table > 0).dot(cosine).dot((otu_table > 0).T)
graph_dm.to_csv('../results/simrank.txt', sep='\t')
# Uses Aitchison distance
# samples = ['CF31_A', u'CF31_B', u'CF141_A', u'CF141_B', u'Tuni', u'Bry']
dm = cosine.values
dm[dm == np.inf] = 0
mat = otu_table.values
mat = multiplicative_replacement(mat)
graph_dm = connected_dm(mat, dm)
graph_dm += graph_dm.T
samples = otu_table.index
graph_dm = pd.DataFrame(graph_dm, index=samples, columns=samples)
graph_dm.to_csv('../results/aitchison.txt', sep='\t')

# Read in graph_dm
graph_dm = pd.read_csv('../results/unconnected_aitchison.txt',
                       sep='\t',
                       index_col=0)
# table = pd.read_table('../data/skinmap_chemiFrac_test.txt',
#                        sep='\t', index_col=0)
graph_dm.index = table.columns
graph_dm.columns = table.columns
# _dm = pw_distances('braycurtis', table.values, table.index.values)
Example no. 26
    def multiplicative_replacement_warning(self):
        with self.assertRaises(ValueError):
            multiplicative_replacement([0, 1, 2], delta=1)
Example no. 27
shortest = pd.DataFrame(shortest,
                        columns=dm.index, index=dm.columns)
shortest = shortest.reindex(sorted(otu_table.columns), axis=0)
shortest = shortest.reindex(sorted(otu_table.columns), axis=1)
# otu_table = table.T
otu_table = otu_table.reindex(sorted(otu_table.columns), axis=1)

# Uses an idea similar to simrank
graph_dm = (otu_table>0).dot(cosine).dot((otu_table>0).T)
graph_dm.to_csv('../results/simrank.txt', sep='\t')
# Uses Aitchison distance
# samples = ['CF31_A', u'CF31_B', u'CF141_A', u'CF141_B', u'Tuni', u'Bry']
dm = cosine.values
dm[dm==np.inf]=0
mat = otu_table.values
mat = multiplicative_replacement(mat)
graph_dm = connected_dm(mat, dm)
graph_dm += graph_dm.T
samples = otu_table.index
graph_dm = pd.DataFrame(graph_dm,
                        index=samples,
                        columns=samples)
graph_dm.to_csv('../results/aitchison.txt', sep='\t')

# Read in graph_dm
graph_dm = pd.read_csv('../results/unconnected_aitchison.txt',
                       sep='\t', index_col=0)
# table = pd.read_table('../data/skinmap_chemiFrac_test.txt',
#                        sep='\t', index_col=0)
graph_dm.index = table.columns
graph_dm.columns = table.columns
Example no. 28
# which is much faster than the R package ancom.R::ANCOM

from ancomP.stats.ancom import ancom
import pandas as pd
import numpy as np
from skbio.stats.composition import multiplicative_replacement

p = 20

for j in range(50):
    dir1 = 'H:/Tree/tree_base/p=' + str(p) + '/otu_table.' + str(j+1) + '.txt'
    with open(dir1, 'r') as data:
        tmp = [[int(n) for n in line.strip().split(' ')] for line in data]
    dat = multiplicative_replacement(tmp)
    ind = np.arange(1, p + 1, 1)
    sam = np.arange(1, 101, 1)
    table = pd.DataFrame(dat, index=sam, columns=ind)
    grouping = pd.Series(sorted([0, 1] * 50), index=sam)

    results = ancom(table, grouping) # default parameters
    resultsT = results.T
    resultsT.to_csv('H:/Tree/tree_base/p=' + str(p) + '/ANCOM.csv', mode='a', header=False)
Example no. 29
def loadAbundance(filename, compositionNorm=True, truncate=True):
    """Load OTU counts file (phylum, genus or species level)
    with OTUs along the rows and samples along the columns.

    Parameters
    ----------
    filename : str
        Excel file from QIIME pipeline.
        Contains OTUs along the rows and samples along the columns,
        with a few header rows.
    compositionNorm : bool
        Add delta count to zeros and normalize each sample by the
        total number of reads. (uses skbio.stats.composition.multiplicative_replacement)
    truncate : bool
        Discard taxa with less than 0.5% of total reads.
        Discard taxa that are not present in 25% of samples.
        """
    def _cleanCountDf(df):
        """Drop extra columns/headers and transpose so that
        samples are along rows and OTUs along columns.

        Returns
        -------
        outDf : pd.DataFrame [index: samples, columns: OTUs]"""

        df = df.drop(['tax_id', 'rank'], axis=1)
        df = df.dropna(subset=['tax_name'], axis=0)
        df = df.rename(columns={'tax_name': 'OTU'})
        df = df.set_index('OTU')
        df = df.drop(['specimen'], axis=0)
        df = df.T
        df = df.dropna(subset=['label'], axis=0)
        df['sid'] = df.label.str.replace('Sample-', 'S')
        df = df.set_index('sid')
        df = df.drop('label', axis=1)
        df = df.astype(float)
        return df

    def _discardLow(df, thresh=0.005):
        """Discard taxa/columns with less than 0.5% of reads"""
        totReads = df.values.sum()
        keepInd1 = (df.sum(axis=0) / totReads) > thresh
        """Also discard taxa that are not present in 25% of samples"""
        keepInd2 = (df > 0).sum(axis=0) / df.shape[0] > 0.25

        return df.loc[:, keepInd1 & keepInd2]

    df = pd.read_excel(filename)
    df = _cleanCountDf(df)

    if truncate:
        df = _discardLow(df)

    if compositionNorm:
        values = composition.multiplicative_replacement(df.values)
        df = pd.DataFrame(values, columns=df.columns, index=df.index)

    cols = [c for c in df.columns if c not in ['sid']]

    print('Abundance data: %s samples, %s taxa' % (df.shape[0], len(cols)))
    return df, cols
Example no. 30
def CLRPermTest(otuDf,
                labels,
                statfunc=_rhoStat,
                nperms=999,
                adjMethod='fdr_bh',
                seed=110820,
                binary=False):
    """Calculates centered-log-ratio (CLR) for all OTUs and performs
    permutation tests to determine if there is a significant correlation
    in OTU ratios with respect to the label variable of interest.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels : pd.Series (float)
        Contains binary variable indicating membership into one of two categories
        (e.g. treatment conditions). Must share index with otuDf.
    statfunc : function
        Takes a np.array [n x k] and float index [n] as parameters and
        returns a 1-D array of the statistic [k].
    nperms : int
        Number of iterations for the permutation test.
    adjMethod : string
        Passed to sm.stats.multipletests for p-value multiplicity adjustment.
        If value is None then no adjustment is made.
    seed : int
        Seed for random permutation generation.

    Returns
    -------
    qvalues : pd.Series [index: OTU]
        Q/P-values for each OTU computed.
    observed : pd.Series [index: OTU]
        Log-ratio statistic summarizing across samples."""

    nSamples, nOTUs = otuDf.shape

    if binary:
        labelValues = labels.values.astype(bool)
    else:
        labelValues = labels.values.astype(float)

    # Make proportions
    otuDf = otuDf / otuDf.sum()
    # Apply multiplicative replacement for zero values
    otuMR = multiplicative_replacement(otuDf.values)
    # Calculate the CLR
    otuCLR = clr(otuMR)
    # Make into a DataFrame
    otuCLR = pd.DataFrame(otuCLR, index=otuDf.index, columns=otuDf.columns)

    obs = statfunc(otuCLR.values, labelValues)

    np.random.seed(seed)
    samples = np.zeros((nperms, nOTUs))

    for permi in range(nperms):
        samples[permi, :] = statfunc(
            otuCLR.values, labelValues[np.random.permutation(nSamples)])

    pvalues = ((np.abs(samples) >= np.abs(obs[None, :])).sum(axis=0) +
               1) / (nperms + 1)

    if adjMethod is None or adjMethod.lower() == 'none':
        qvalues = pvalues
    else:
        qvalues = _pvalueAdjust(pvalues, method=adjMethod)

    qvalues = pd.Series(qvalues, index=otuDf.columns)
    observed = pd.Series(obs, index=otuDf.columns)

    return qvalues, observed
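_rhoStat is likewise not defined here; a hypothetical per-OTU statfunc with the documented signature (np.array [n x k] plus labels [n], returning a 1-D array [k]) might be:

import numpy as np
from scipy import stats

def rho_stat(mat, labels):
    # Spearman correlation between each OTU's CLR values and the label
    return np.array([stats.spearmanr(mat[:, k], labels).correlation
                     for k in range(mat.shape[1])])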
Example no. 31
def CLRPermTest(otuDf, labels, statfunc=_rhoStat, nperms=999, adjMethod='fdr_bh', seed=110820, binary=False):
    """Calculates centered-log-ratio (CLR) for all OTUs and performs
    permutation tests to determine if there is a significant correlation
    in OTU ratios with respect to the label variable of interest.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels : pd.Series (float)
        Contains binary variable indicating membership into one of two categories
        (e.g. treatment conditions). Must share index with otuDf.
    statfunc : function
        Takes a np.array [n x k] and float index [n] as parameters and
        returns a 1-D array of the statistic [k].
    nperms : int
        Number of iterations for the permutation test.
    adjMethod : string
        Passed to sm.stats.multipletests for p-value multiplicity adjustment.
        If value is None then no adjustment is made.
    seed : int
        Seed for random permutation generation.

    Returns
    -------
    qvalues : pd.Series [index: OTU]
        Q/P-values for each OTU computed.
    observed : pd.Series [index: OTU]
        Log-ratio statistic summarizing across samples."""

    nSamples, nOTUs = otuDf.shape

    if binary:
        labelValues = labels.values.astype(bool)
    else:
        labelValues = labels.values.astype(float)

    # Make proportions
    otuDf = otuDf / otuDf.sum()
    # Apply multiplicative replacement for zero values
    otuMR = multiplicative_replacement(otuDf.values)
    # Calculate the CLR
    otuCLR = clr(otuMR)
    # Make into a DataFrame
    otuCLR = pd.DataFrame(otuCLR, index=otuDf.index, columns=otuDf.columns)

    obs = statfunc(otuCLR.values, labelValues)

    np.random.seed(seed)
    samples = np.zeros((nperms, nOTUs))

    for permi in range(nperms):
        samples[permi, :] = statfunc(
            otuCLR.values,
            labelValues[np.random.permutation(nSamples)]
        )

    pvalues = ((np.abs(samples) >= np.abs(obs[None, :])).sum(
        axis=0) + 1) / (nperms + 1)

    if adjMethod is None or adjMethod.lower() == 'none':
        qvalues = pvalues
    else:
        qvalues = _pvalueAdjust(pvalues, method=adjMethod)

    qvalues = pd.Series(qvalues, index=otuDf.columns)
    observed = pd.Series(obs, index=otuDf.columns)

    return qvalues, observed