Example #1
def py2rpy_anndata(obj: AnnData) -> RS4:
    with localconverter(default_converter):
        s4v = importr("S4Vectors")
        sce = importr("SingleCellExperiment")
        # TODO: sparse
        x = {} if obj.X is None else dict(X=numpy2ri.py2rpy(obj.X.T))
        layers = {k: numpy2ri.py2rpy(v.T) for k, v in obj.layers.items()}
        assays = ListVector({**x, **layers})

        row_args = {k: pandas2ri.py2rpy(v) for k, v in obj.var.items()}
        if check_no_dupes(obj.var_names, "var_names"):
            row_args["row.names"] = pandas2ri.py2rpy(obj.var_names)
        row_data = s4v.DataFrame(**row_args)

        col_args = {k: pandas2ri.py2rpy(v) for k, v in obj.obs.items()}
        if check_no_dupes(obj.obs_names, "obs_names"):
            col_args["row.names"] = pandas2ri.py2rpy(obj.obs_names)
        col_data = s4v.DataFrame(**col_args)

        # Convert everything we know
        with localconverter(full_converter() + dict_converter):
            metadata = ListVector(obj.uns.items())

        rd_args = {conv_name.scanpy2sce(k): numpy2ri.py2rpy(obj.obsm[k]) for k in obj.obsm.keys()}
        reduced_dims = s4v.SimpleList(**rd_args)

        return sce.SingleCellExperiment(
            assays=assays, rowData=row_data, colData=col_data, metadata=metadata, reducedDims=reduced_dims
        )
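
A minimal usage sketch for the converter above. It assumes anndata is installed and that the helpers the function relies on (check_no_dupes, full_converter, dict_converter, conv_name) are defined in the same module; the toy data is hypothetical.

import numpy as np
import pandas as pd
from anndata import AnnData

# Tiny AnnData: 4 cells x 3 genes (hypothetical toy data)
adata = AnnData(
    X=np.random.poisson(1.0, size=(4, 3)).astype(float),
    obs=pd.DataFrame({"group": ["a", "a", "b", "b"]},
                     index=[f"cell{i}" for i in range(4)]),
    var=pd.DataFrame({"symbol": ["g1", "g2", "g3"]},
                     index=[f"gene{i}" for i in range(3)]),
)
sce = py2rpy_anndata(adata)  # an RS4 object wrapping a SingleCellExperiment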
Example #2
    def call_DESeq2(self, count_data, samples, conditions):
        """Call DESeq2.
        @count_data is a DataFrame with 'samples' as the column names.
        @samples is a list. @conditions as well. Condition is the one you're contrasting on.
        You can add additional_conditions (a DataFrame, index = samples) which DESeq2 will
        keep under consideration (changes the formula).
        """
        import rpy2.robjects as robjects
        import rpy2.robjects.numpy2ri as numpy2ri
        import mbf_r

        count_data = count_data.values
        count_data = np.array(count_data)
        nr, nc = count_data.shape
        count_data = count_data.reshape(count_data.size)  # turn into 1d vector
        count_data = robjects.r.matrix(
            numpy2ri.py2rpy(count_data), nrow=nr, ncol=nc, byrow=True
        )
        col_data = pd.DataFrame({"sample": samples, "condition": conditions}).set_index(
            "sample"
        )
        formula = "~ condition"
        col_data = col_data.reset_index(drop=True)
        col_data = mbf_r.convert_dataframe_to_r(pd.DataFrame(col_data.to_dict("list")))
        deseq_experiment = robjects.r("DESeqDataSetFromMatrix")(
            countData=count_data, colData=col_data, design=robjects.Formula(formula)
        )
        deseq_experiment = robjects.r("DESeq")(deseq_experiment)
        res = robjects.r("results")(
            deseq_experiment, contrast=robjects.r("c")("condition", "c", "base")
        )
        df = mbf_r.convert_dataframe_from_r(robjects.r("as.data.frame")(res))
        return df
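
A hypothetical call sketch for the method above (the instance name runner and the toy counts are made up; note that the contrast is hard-coded to the condition levels "c" and "base", so those labels must appear in conditions):

counts = pd.DataFrame(
    {"s1": [10, 0, 5], "s2": [12, 1, 7], "s3": [100, 3, 2], "s4": [90, 4, 1]},
    index=["geneA", "geneB", "geneC"],
)
deg_table = runner.call_DESeq2(counts,
                               samples=["s1", "s2", "s3", "s4"],
                               conditions=["base", "base", "c", "c"])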
Example #3
import time

import numpy as np
import knockadapt.graphs
import knockadapt.utilities
from rpy2.robjects import numpy2ri
from rpy2.robjects.packages import importr


def main():

    # Simulate a ton of Ising data
    seed = 110
    n = 10000
    p = 625
    np.random.seed(seed)
    time0 = time.time()
    X, _, _, Q, V = knockadapt.graphs.sample_data(
        n=n,
        p=p,
        x_dist='gibbs',
        method='ising',
    )
    np.fill_diagonal(Q, 1)
    print(f"Took {time.time() - time0} to sim data")

    # Construct sparsity pattern
    sparsity = []
    for i in range(p):
        for j in range(i):
            if Q[i, j] == 0:
                # Remember R is 1 indexed
                # \_O_/ this took me a while to figure out
                sparsity.append((i + 1, j + 1))
    sparsity = np.array(sparsity)

    # Push to R
    Vr = numpy2ri.py2rpy(V)
    sparsityr = numpy2ri.py2rpy(sparsity)

    # Estimate precision matrix using graphical lasso
    glasso = importr('glasso')
    Vglasso = glasso.glasso(Vr, rho=0.01, nobs=n, zero=sparsityr)

    # Extract output and enforce sparsity
    Qest = np.asarray(Vglasso[1])
    Qest[Q == 0] = 0
    Vest = knockadapt.utilities.chol2inv(Qest)
    del V
    del Q

    # Save to output
    vfname = 'vout.txt'
    np.savetxt(vfname, Vest)
    qfname = 'qout.txt'
    np.savetxt(qfname, Qest)
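
The positional access Vglasso[1] above depends on the order of glasso's return components. A name-based alternative (a sketch, assuming the returned R list carries its usual component names such as 'w' and 'wi') is arguably clearer:

# 'wi' is glasso's estimated inverse covariance (precision) matrix
Qest = np.asarray(Vglasso.rx2('wi'))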
Example #4
    def call_edgeR(self, df_counts: DataFrame) -> DataFrame:
        """
        Call edgeR via rpy2 to get TMM (trimmed mean of M-values)
        normalization for raw counts.

        Prepares the edgeR input in Python and calls edgeR's calcNormFactors
        via rpy2. The TMM-normalized values are returned as an R data.frame,
        which is converted back to a pandas DataFrame via rpy2.

        Parameters
        ----------
        df_counts : DataFrame
            The dataframe containing the raw counts.

        Returns
        -------
        DataFrame
            A dataframe with TMM values (trimmed mean of M-values).
        """
        ro.r("library(edgeR)")
        ro.r("library(base)")
        df_input = df_counts
        columns = df_input.columns
        to_df = {"lib.size": df_input.sum(axis=0).values}
        if self.samples_to_group is not None:
            to_df["group"] = [
                self.samples_to_group[sample_name]
                for sample_name in self.samples_to_group
            ]
        if self.batch is not None:
            to_df["batch"] = self.batch
        df_samples = pd.DataFrame(to_df)
        df_samples["lib.size"] = df_samples["lib.size"].astype(int)
        r_counts = mbf_r.convert_dataframe_to_r(df_input)
        r_samples = mbf_r.convert_dataframe_to_r(df_samples)
        y = ro.r("DGEList")(
            counts=r_counts,
            samples=r_samples,
        )
        # apply TMM normalization
        y = ro.r("calcNormFactors")(y)  # default is TMM
        logtmm = ro.r("""function(y){
                cpm(y, log=TRUE, prior.count=5)
                }""")(
            y
        )  # apparently removeBatchEffects works better on log2-transformed values
        if self.batch is not None:
            batches = np.array(self.batch)
            batches = numpy2ri.py2rpy(batches)
            logtmm = ro.r("""
                function(logtmm, batch) {
                    tmm = removeBatchEffect(logtmm,batch=batch)
                }
                """)(logtmm=logtmm, batch=batches)
        cpm = ro.r("data.frame")(logtmm)
        df = mbf_r.convert_dataframe_from_r(cpm)
        df = df.reset_index(drop=True)
        df.columns = columns
        return df
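
A hypothetical usage sketch for the method above (the instance name normalizer and the toy counts are made up; samples_to_group and batch are assumed to be None here, so only library sizes are passed along):

raw = pd.DataFrame(
    {"sample1": [10, 0, 7], "sample2": [12, 3, 9], "sample3": [8, 1, 5]},
    index=["geneA", "geneB", "geneC"],
)
log_tmm = normalizer.call_edgeR(raw)  # log2 CPM values after TMM normalization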
Example #5
def _pruning(X, G, pruneMethod=robjects.r.selGam,
             pruneMethodPars=ListVector({'cutOffPVal': 0.001, 'numBasisFcts': 10}),
             output=False):
    # X is an R matrix
    # G is an adjacency matrix as a python numpy array

    d = G.shape[0]
    X = robjects.r.matrix(numpy2ri.py2rpy(X), ncol=d)
    G = robjects.r.matrix(numpy2ri.py2rpy(G), d, d)
    finalG = robjects.r.matrix(0, d, d)
    for i in range(d):
        # .rx/.ro provide R-style indexing and operators; R is 1-indexed, hence i + 1
        parents = robjects.r.which(G.rx(True, i + 1).ro == 1)
        lenpa = robjects.r.length(parents)[0]
        if lenpa > 0:
            # Regress column i on its candidate parents and keep only the selected ones
            Xtmp = robjects.r.cbind(X.rx(True, parents), X.rx(True, i + 1))
            selectedPar = pruneMethod(Xtmp, k=lenpa + 1, pars=pruneMethodPars, output=output)
            finalParents = parents.rx(selectedPar)
            finalG.rx[finalParents, i + 1] = 1

    return np.array(finalG)
Example #6
def test_cpm_normalization():
    given = np.array([
        [5, 4, 3],
        [2, 1, 4],
        [3, 4, 6],
        [4, 2, 8],
    ])
    from rpy2.robjects import numpy2ri
    from rpy2.robjects.packages import importr
    edgeR = importr('edgeR')
    expectation = numpy2ri.rpy2py(edgeR.cpm(numpy2ri.py2rpy(given)))
    assert np.allclose(cpm_normalize(given), expectation, atol=1e-2)
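
The cpm_normalize under test is defined elsewhere; a minimal counts-per-million implementation consistent with edgeR's default cpm() on a plain matrix (library size = column sum, no log) might look like this sketch:

def cpm_normalize(counts):
    # Scale each column (one sample) to counts per million of its library size
    lib_sizes = counts.sum(axis=0)
    return counts / lib_sizes * 1e6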
Example #7
    def edgeR_comparison(
        self, df, columns_a, columns_b, library_sizes=None, manual_dispersion_value=0.4
    ):
        """Call edgeR exactTest comparing two groups.
        Resulting dataframe is in df order.
        """
        import mbf_r
        import math
        import rpy2.robjects as ro
        import rpy2.robjects.numpy2ri as numpy2ri

        ro.r("library(edgeR)")
        input_df = df[columns_a + columns_b]
        input_df.columns = ["X_%i" % x for x in range(len(input_df.columns))]
        if library_sizes is not None:  # pragma: no cover
            samples = pd.DataFrame({"lib.size": library_sizes})
        else:
            samples = pd.DataFrame({"lib.size": input_df.sum(axis=0)})
        # this looks like it inverts the columns, but it doesn't
        samples.insert(0, "group", ["z"] * len(columns_b) + ["x"] * len(columns_a))
        r_counts = mbf_r.convert_dataframe_to_r(input_df)
        r_samples = mbf_r.convert_dataframe_to_r(samples)
        y = ro.r("DGEList")(
            counts=r_counts,
            samples=r_samples,
            **{
                "lib.size": ro.r("as.vector")(
                    numpy2ri.py2rpy(np.array(samples["lib.size"]))
                )
            },
        )
        # apply TMM normalization
        y = ro.r("calcNormFactors")(y)
        if len(columns_a) == 1 and len(columns_b) == 1:  # pragma: no cover
            # not currently used.
            z = manual_dispersion_value
            e = ro.r("exactTest")(y, dispersion=math.pow(manual_dispersion_value, 2))
            """
            You are attempting to estimate dispersions without any replicates.
            Since this is not possible, there are several inferior workarounds to
            come up with something still semi-useful:
            1. pick a reasonable dispersion value from "Experience": 0.4 for humans, 0.1 for genetically identical model organisms, 0.01 for technical replicates. We'll try this for now.
            2. estimate dispersions on a number of genes that you KNOW to be not differentially expressed.
            3. In case of multiple factor experiments, discard the least important factors and treat the samples as replicates.
            4. just use logFC and forget about significance.
            """
        else:
            z = ro.r("estimateDisp")(y, robust=True)
            e = ro.r("exactTest")(z)
        res = ro.r("topTags")(e, n=len(input_df), **{"sort.by": "none"})
        result = mbf_r.convert_dataframe_from_r(res[0])
        return result
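
A hypothetical call sketch for the unpaired comparison above (the instance name comparer, the column names and the counts are made up):

df = pd.DataFrame({
    "ctrl_1": [10, 0, 5], "ctrl_2": [12, 1, 7],
    "treat_1": [100, 3, 2], "treat_2": [90, 4, 1],
}, index=["geneA", "geneB", "geneC"])
result = comparer.edgeR_comparison(df,
                                   columns_a=["ctrl_1", "ctrl_2"],
                                   columns_b=["treat_1", "treat_2"])
# result carries edgeR's topTags columns (e.g. logFC, PValue, FDR) in df row order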
Example #8
    def edgeR_comparison(
        self, df, columns_a, columns_b, library_sizes=None, manual_dispersion_value=0.4
    ):
        """Call edgeR exactTest comparing two groups.
        Resulting dataframe is in df order.
        """
        import mbf_r
        import rpy2.robjects as ro
        import rpy2.robjects.numpy2ri as numpy2ri

        if len(columns_a) != len(columns_b):
            raise ValueError("paired requires equal length groups")

        ro.r("library(edgeR)")
        input_df = df[columns_a + columns_b]
        input_df.columns = ["X_%i" % x for x in range(len(input_df.columns))]
        if library_sizes is not None:  # pragma: no cover
            samples = pd.DataFrame({"lib.size": library_sizes})
        else:
            samples = pd.DataFrame({"lib.size": input_df.sum(axis=0)})
        # remember, edgeR does b-a not a-b...
        samples.insert(0, "group", ["z"] * len(columns_b) + ["y"] * len(columns_a))
        samples.insert(
            1,
            "pairs",
            [str(x) for x in list(range(len(columns_a))) + list(range(len(columns_a)))],
        )

        r_counts = mbf_r.convert_dataframe_to_r(input_df)
        r_samples = mbf_r.convert_dataframe_to_r(samples)
        design = ro.r("model.matrix")(ro.r("~pairs+group"), data=r_samples)
        y = ro.r("DGEList")(
            counts=r_counts,
            samples=r_samples,
            **{
                "lib.size": ro.r("as.vector")(
                    numpy2ri.py2rpy(np.array(samples["lib.size"]))
                )
            },
        )
        # apply TMM normalization
        y = ro.r("calcNormFactors")(y)
        z = ro.r("estimateDisp")(y, design, robust=True)
        fit = ro.r("glmFit")(z, design)
        lrt = ro.r("glmLRT")(fit)
        res = ro.r("topTags")(lrt, n=len(input_df), **{"sort.by": "none"})
        result = mbf_r.convert_dataframe_from_r(res[0])
        return result
Example #9
File: r.py  Project: malei-pku/navis
def xform_brain(x: Union['core.NeuronObject', 'pd.DataFrame', 'np.ndarray'],
                source: str,
                target: str,
                fallback: Optional[Literal['AFFINE']] = None,
                **kwargs) -> Union['core.NeuronObject',
                                   'pd.DataFrame',
                                   'np.ndarray']:
    """ Transform 3D data between template brains. This is just a wrapper for
    ``nat.templatebrains:xform_brain``.

    Parameters
    ----------
    x :         Neuron/List | numpy.ndarray | pandas.DataFrame
                Data to transform. Dataframe must contain ``['x', 'y', 'z']``
                columns. Numpy array must be shape ``(N, 3)``.
    source :    str
                Source template brain that the data currently is in.
    target :    str
                Target template brain that the data should be transformed into.
    fallback :  None | "AFFINE"
                If "AFFINE", will fall back to an affine transformation if the
                CMTK transformation fails. Otherwise, coordinates of points for
                which the transformation failed (e.g. because they are out of
                bounds) will be returned as ``None``.
    **kwargs
                Keyword arguments passed to ``nat.templatebrains:xform_brain``.

    Returns
    -------
    same type as ``x``
                Copy of input with transformed coordinates.
    """

    if not isinstance(x, (core.TreeNeuron, np.ndarray, pd.DataFrame)):
        raise TypeError(f'Unable to transform data of type "{type(x)}"')

    if isinstance(x, core.TreeNeuron):
        x = x.copy()
        x.nodes = xform_brain(x.nodes, source, target)
        if x.has_connectors:
            x.connectors = xform_brain(x.connectors, source, target)
        return x
    elif isinstance(x, pd.DataFrame):
        if any([c not in x.columns for c in ['x', 'y', 'z']]):
            raise ValueError('DataFrame must have x, y and z columns.')
        x = x.copy()
        x.loc[:, ['x', 'y', 'z']] = xform_brain(x[['x', 'y', 'z']].values.astype(float),
                                                source, target)
        return x
    elif x.shape[1] != 3:
        raise ValueError('Array must be of shape (N, 3).')

    if isinstance(source, str):
        source = robjects.r(source)
    else:
        raise TypeError(f'Expected source of type str, got "{type(source)}"')

    if isinstance(target, str):
        target = robjects.r(target)
    else:
        raise TypeError(f'Expected target of type str, got "{type(target)}"')

    # We need to convert numpy arrays explicitly
    if isinstance(x, np.ndarray):
        x = numpy2ri.py2rpy(x)

    xf = nat_templatebrains.xform_brain(x,
                                        sample=source,
                                        reference=target,
                                        FallBackToAffine=fallback == 'AFFINE',
                                        **kwargs)

    return np.array(xf)
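
A hypothetical usage sketch (assumes an R installation with the nat, nat.templatebrains and nat.flybrains packages, which provide template-brain objects such as FCWB and JFRC2; the points are made up):

pts = np.array([[429.0, 205.0, 56.0],
                [431.5, 200.0, 60.0]])
xf = xform_brain(pts, source='FCWB', target='JFRC2', fallback='AFFINE')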
Example #10
def CAM(XX, pns_type=None, pns_thres=None, adj_after_pns=None, pruning_type=None):
    # XX is a numpy array

    string = '''
    asSparseMatrix <- function(d){
        return(as(matrix(0, d, d), "sparseMatrix"))
    }

    whichMax <- function(input){
        return(which.max(input))
    }
    '''
    selfpack = SignatureTranslatedAnonymousPackage(string, "selfpack")

    n, d = XX.shape
    maxNumParents = min(d - 1, round(n / 20))
    X = numpy2ri.py2rpy(XX)

    if pns_type is not None:
        # Note: the original "pns_thres != None & ..." would fail because & binds
        # tighter than the comparisons; use a chained comparison instead.
        if pns_thres is not None and 0 <= pns_thres <= 1:
            selMat = pns_type(X, pns_thres=pns_thres, verbose=False)
        else:
            raise ValueError("pns_thres must be a number in [0, 1] when pns_type is given")
    else:
        if adj_after_pns is None:
            selMat = np.ones((d, d))
        else:
            selMat = adj_after_pns

    computeScoreMatTmp = robjects.r.computeScoreMat(X, scoreName='SEMGAM',
                                                    numParents=1, numCores=1, output=False,
                                                    selMat=numpy2ri.py2rpy(selMat),
                                                    parsScore=ListVector({'numBasisFcts': 10}), intervMat=float('nan'),
                                                    intervData=False)
    scoreVec = []
    edgeList = []
    pathMatrix = robjects.r.matrix(0, d, d)
    # Adj = selfpack.asSparseMatrix(d)
    Adj = robjects.r.matrix(0, d, d)
    scoreNodes = computeScoreMatTmp.rx('scoreEmptyNodes')[0]
    scoreMat = computeScoreMatTmp.rx('scoreMat')[0]
    counterUpdate = 0
    while sum(scoreMat.ro != -float('inf')) > 0:
        # print(sum(scoreMat.ro != -float('inf')))
        # Greedily pick the remaining edge with the largest score gain
        ix_max = robjects.r.arrayInd(selfpack.whichMax(scoreMat),
                                     robjects.r.dim(scoreMat))
        ix_max_backward = robjects.r.matrix(IntVector([ix_max[1], ix_max[0]]), 1, 2)
        Adj.rx[ix_max] = 1
        scoreNodes.rx[ix_max[1]] = scoreNodes.rx(ix_max[1]).ro + scoreMat.rx(ix_max)
        scoreMat.rx[ix_max] = -float('inf')
        # Update the path matrix and block any edge that would close a cycle
        pathMatrix.rx[ix_max[0], ix_max[1]] = 1
        DescOfNewChild = robjects.r.which(pathMatrix.rx(ix_max[1], True).ro == 1)
        AncOfNewChild = robjects.r.which(pathMatrix.rx(True, ix_max[0]).ro == 1)
        pathMatrix.rx[AncOfNewChild, DescOfNewChild] = 1
        scoreMat.rx[robjects.r.t(pathMatrix).ro == 1] = -float('inf')
        scoreMat.rx[ix_max[1], ix_max[0]] = -float('inf')
        scoreVec.append(sum(scoreNodes))
        edgeList.append(list(ix_max))
        scoreMat = robjects.r.updateScoreMat(scoreMat, X, scoreName='SEMGAM', i=ix_max[0], j=ix_max[1],
                                             scoreNodes=scoreNodes, Adj=Adj, numCores=1, output=False,
                                             maxNumParents=maxNumParents, parsScore=ListVector({'numBasisFcts': 10}),
                                             intervMat=float('nan'), intervData=False)
        counterUpdate = counterUpdate + 1

    if pruning_type is not None:
        # Adj is the output; pruning would be applied here (see pruning_cam below)
        pass

    return np.array(Adj)
Example #11
def pruning_cam(XX, Adj):
    X2 = numpy2ri.py2rpy(XX)
    Adj = _pruning(X=X2, G=Adj, pruneMethod=robjects.r.selGam,
                   pruneMethodPars=ListVector({'cutOffPVal': 0.001, 'numBasisFcts': 10}),
                   output=False)

    return Adj
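
A rough end-to-end sketch tying Examples #10 and #11 together. It assumes the R 'CAM' package has been loaded (e.g. via robjects.r('library(CAM)')) so that computeScoreMat, updateScoreMat and selGam are available, and uses made-up data:

X = np.random.randn(200, 4)           # 200 samples, 4 variables
adj = CAM(X)                          # greedy CAM edge search, no PNS, no pruning
adj_pruned = pruning_cam(X, adj)      # GAM-based pruning of the estimated edges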