# Imports this snippet relies on; full_converter, dict_converter,
# check_no_dupes and conv_name are module-local helpers assumed to be in scope.
from anndata import AnnData
from rpy2.robjects import default_converter, numpy2ri, pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.methods import RS4
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import ListVector


def py2rpy_anndata(obj: AnnData) -> RS4:
    with localconverter(default_converter):
        s4v = importr("S4Vectors")
        sce = importr("SingleCellExperiment")

        # TODO: sparse
        x = {} if obj.X is None else dict(X=numpy2ri.py2rpy(obj.X.T))
        layers = {k: numpy2ri.py2rpy(v.T) for k, v in obj.layers.items()}
        assays = ListVector({**x, **layers})

        row_args = {k: pandas2ri.py2rpy(v) for k, v in obj.var.items()}
        if check_no_dupes(obj.var_names, "var_names"):
            row_args["row.names"] = pandas2ri.py2rpy(obj.var_names)
        row_data = s4v.DataFrame(**row_args)

        col_args = {k: pandas2ri.py2rpy(v) for k, v in obj.obs.items()}
        if check_no_dupes(obj.obs_names, "obs_names"):
            col_args["row.names"] = pandas2ri.py2rpy(obj.obs_names)
        col_data = s4v.DataFrame(**col_args)

        # Convert everything we know
        with localconverter(full_converter() + dict_converter):
            metadata = ListVector(obj.uns.items())

        rd_args = {
            conv_name.scanpy2sce(k): numpy2ri.py2rpy(obj.obsm[k])
            for k in obj.obsm.keys()
        }
        reduced_dims = s4v.SimpleList(**rd_args)

        return sce.SingleCellExperiment(
            assays=assays,
            rowData=row_data,
            colData=col_data,
            metadata=metadata,
            reducedDims=reduced_dims,
        )
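# Hypothetical usage sketch for py2rpy_anndata (not part of the original
# module). It assumes a working R installation with SingleCellExperiment and
# that the module-local helpers referenced above are importable alongside it.
import anndata as ad
import numpy as np

adata = ad.AnnData(
    X=np.random.poisson(1.0, size=(20, 10)).astype(np.float64),
    obsm={"X_pca": np.random.normal(size=(20, 5))},
)
sce = py2rpy_anndata(adata)  # RS4 handle on a SingleCellExperiment object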
def call_DESeq2(self, count_data, samples, conditions):
    """Call DESeq2.

    @count_data is a DataFrame with @samples as its column names.
    @samples and @conditions are lists; condition is the factor you're
    contrasting on. You can add additional_conditions (a DataFrame,
    index = samples) which DESeq2 will keep under consideration (this
    changes the formula).
    """
    import rpy2.robjects as robjects
    import rpy2.robjects.numpy2ri as numpy2ri
    import mbf_r

    # Flatten the counts and rebuild them as an R matrix (row-major).
    count_data = count_data.values
    count_data = np.array(count_data)
    nr, nc = count_data.shape
    count_data = count_data.reshape(count_data.size)  # turn into 1d vector
    count_data = robjects.r.matrix(
        numpy2ri.py2rpy(count_data), nrow=nr, ncol=nc, byrow=True
    )

    # Sample sheet: one row per sample, annotated with its condition.
    col_data = pd.DataFrame({"sample": samples, "condition": conditions}).set_index(
        "sample"
    )
    formula = "~ condition"
    col_data = col_data.reset_index(drop=True)
    col_data = mbf_r.convert_dataframe_to_r(pd.DataFrame(col_data.to_dict("list")))

    deseq_experiment = robjects.r("DESeqDataSetFromMatrix")(
        countData=count_data, colData=col_data, design=robjects.Formula(formula)
    )
    deseq_experiment = robjects.r("DESeq")(deseq_experiment)
    res = robjects.r("results")(
        deseq_experiment, contrast=robjects.r("c")("condition", "c", "base")
    )
    df = mbf_r.convert_dataframe_from_r(robjects.r("as.data.frame")(res))
    return df
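# Hypothetical call of call_DESeq2 (assumes `obj` is an instance of the class
# defining it and that R with DESeq2 plus the mbf_r helpers are installed).
# The contrast hard-coded above is c("condition", "c", "base"), so the
# conditions passed in should use the labels "c" and "base".
counts = pd.DataFrame(
    np.random.poisson(10, size=(1000, 4)),
    columns=["s1", "s2", "s3", "s4"],
)
res = obj.call_DESeq2(
    counts,
    samples=["s1", "s2", "s3", "s4"],
    conditions=["base", "base", "c", "c"],
)  # DESeq2 results table (log2FoldChange, pvalue, padj, ...) as a DataFrame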
def main():
    # Simulate a ton of Ising data
    seed = 110
    n = 10000
    p = 625
    np.random.seed(seed)
    time0 = time.time()
    X, _, _, Q, V = knockadapt.graphs.sample_data(
        n=n, p=p, x_dist='gibbs', method='ising',
    )
    np.fill_diagonal(Q, 1)
    print(f"Took {time.time() - time0} to sim data")

    # Construct sparsity pattern
    sparsity = []
    for i in range(p):
        for j in range(i):
            if Q[i, j] == 0:
                # Remember R is 1 indexed
                # \_O_/ this took me a while to figure out
                sparsity.append((i + 1, j + 1))
    sparsity = np.array(sparsity)

    # Push to R
    Vr = numpy2ri.py2rpy(V)
    sparsityr = numpy2ri.py2rpy(sparsity)

    # Estimate precision matrix using graphical lasso
    glasso = importr('glasso')
    Vglasso = glasso.glasso(Vr, rho=0.01, nobs=n, zero=sparsityr)

    # Extract output and enforce sparsity
    Qest = np.asarray(Vglasso[1])
    Qest[Q == 0] = 0
    Vest = knockadapt.utilities.chol2inv(Qest)
    del V
    del Q

    # Save to output
    vfname = 'vout.txt'
    np.savetxt(vfname, Vest)
    qfname = 'qout.txt'
    np.savetxt(qfname, Qest)
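# Note on the extraction step above: Vglasso[1] depends on the positional
# order of the list returned by glasso::glasso. Pulling components out by name
# is slightly more robust (a sketch using the same objects as in main()):
#     Qest = np.asarray(Vglasso.rx2('wi'))  # 'wi' = estimated inverse covariance
#     West = np.asarray(Vglasso.rx2('w'))   # 'w'  = estimated covariance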
def call_edgeR(self, df_counts: DataFrame) -> DataFrame:
    """
    Call edgeR via rpy2 to get TMM (trimmed mean of M-values) normalization
    for raw counts.

    Prepare the edgeR input in python and call edgeR calcNormFactors via
    rpy2. The TMM-normalized values are returned as an R data.frame, which
    is converted back to a pandas DataFrame via rpy2.

    Parameters
    ----------
    df_counts : DataFrame
        The dataframe containing the raw counts.

    Returns
    -------
    DataFrame
        A dataframe with TMM values (trimmed mean of M-values).
    """
    ro.r("library(edgeR)")
    ro.r("library(base)")
    df_input = df_counts
    columns = df_input.columns
    to_df = {"lib.size": df_input.sum(axis=0).values}
    if self.samples_to_group is not None:
        to_df["group"] = [
            self.samples_to_group[sample_name]
            for sample_name in self.samples_to_group
        ]
    if self.batch is not None:
        to_df["batch"] = self.batch
    df_samples = pd.DataFrame(to_df)
    df_samples["lib.size"] = df_samples["lib.size"].astype(int)
    r_counts = mbf_r.convert_dataframe_to_r(df_input)
    r_samples = mbf_r.convert_dataframe_to_r(df_samples)
    y = ro.r("DGEList")(
        counts=r_counts,
        samples=r_samples,
    )
    # apply TMM normalization
    y = ro.r("calcNormFactors")(y)  # default is TMM
    logtmm = ro.r(
        """function(y){
               cpm(y, log=TRUE, prior.count=5)
           }"""
    )(y)
    # apparently removeBatchEffect works better on log2-transformed values
    if self.batch is not None:
        batches = np.array(self.batch)
        batches = numpy2ri.py2rpy(batches)
        logtmm = ro.r(
            """
            function(logtmm, batch) {
                tmm = removeBatchEffect(logtmm, batch=batch)
            }
            """
        )(logtmm=logtmm, batch=batches)
    cpm = ro.r("data.frame")(logtmm)
    df = mbf_r.convert_dataframe_from_r(cpm)
    df = df.reset_index(drop=True)
    df.columns = columns
    return df
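# Hypothetical call of call_edgeR (assumes `obj` is an instance of the class
# defining it, with obj.samples_to_group mapping each column name to a group
# or set to None, obj.batch configured, and R with edgeR/limma installed).
raw_counts = pd.DataFrame(
    np.random.poisson(20, size=(500, 4)),
    columns=["s1", "s2", "s3", "s4"],
)
log_tmm = obj.call_edgeR(raw_counts)  # log2 CPM after TMM, same column names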
def _pruning(X, G, pruneMethod=robjects.r.selGam,
             pruneMethodPars=ListVector({'cutOffPVal': 0.001, 'numBasisFcts': 10}),
             output=False):
    # X is an R matrix of the data; G is a python numpy adjacency matrix.
    d = G.shape[0]
    X = robjects.r.matrix(numpy2ri.py2rpy(X), ncol=d)
    G = robjects.r.matrix(numpy2ri.py2rpy(G), d, d)
    finalG = robjects.r.matrix(0, d, d)
    for i in range(d):
        # candidate parents of node i+1 (R is 1-indexed)
        parents = robjects.r.which(G.rx(True, i + 1).ro == 1)
        lenpa = robjects.r.length(parents)[0]
        if lenpa > 0:
            Xtmp = robjects.r.cbind(X.rx(True, parents), X.rx(True, i + 1))
            selectedPar = pruneMethod(Xtmp, k=lenpa + 1,
                                      pars=pruneMethodPars, output=output)
            finalParents = parents.rx(selectedPar)
            finalG.rx[finalParents, i + 1] = 1
    return np.array(finalG)
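# Quick reference for the rpy2 delegators used in _pruning (general rpy2
# behaviour, not specific to this module): .rx is R's "[" extraction,
# .rx[...] = v is "[<-" assignment, and .ro applies R's element-wise operators.
from rpy2 import robjects

m = robjects.r.matrix(robjects.IntVector([1, 0, 0, 1]), nrow=2)
col1 = m.rx(True, 1)  # m[TRUE, 1] in R: the whole first column (1-indexed)
mask = m.ro == 1      # element-wise comparison done in R, returns an R logical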
def test_cpm_normalization():
    given = np.array([
        [5, 4, 3],
        [2, 1, 4],
        [3, 4, 6],
        [4, 2, 8],
    ])

    from rpy2.robjects import numpy2ri
    from rpy2.robjects.packages import importr

    edgeR = importr('edgeR')
    expectation = numpy2ri.rpy2py(edgeR.cpm(numpy2ri.py2rpy(given)))
    assert np.allclose(cpm_normalize(given), expectation, atol=1e-2)
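# For reference, edgeR's cpm() on a plain count matrix (log=FALSE, no
# normalization factors) scales each column to counts per million. The
# cpm_normalize under test is expected to agree with this minimal sketch,
# which is not the project's actual implementation:
def reference_cpm(counts: np.ndarray) -> np.ndarray:
    return counts / counts.sum(axis=0, keepdims=True) * 1e6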
def edgeR_comparison(
    self, df, columns_a, columns_b, library_sizes=None, manual_dispersion_value=0.4
):
    """Call edgeR exactTest comparing two groups.

    Resulting dataframe is in df order.
    """
    import mbf_r
    import math
    import rpy2.robjects as ro
    import rpy2.robjects.numpy2ri as numpy2ri

    ro.r("library(edgeR)")
    input_df = df[columns_a + columns_b]
    input_df.columns = ["X_%i" % x for x in range(len(input_df.columns))]
    if library_sizes is not None:  # pragma: no cover
        samples = pd.DataFrame({"lib.size": library_sizes})
    else:
        samples = pd.DataFrame({"lib.size": input_df.sum(axis=0)})
    # this looks like it inverts the columns, but it doesn't
    samples.insert(0, "group", ["z"] * len(columns_b) + ["x"] * len(columns_a))
    r_counts = mbf_r.convert_dataframe_to_r(input_df)
    r_samples = mbf_r.convert_dataframe_to_r(samples)
    y = ro.r("DGEList")(
        counts=r_counts,
        samples=r_samples,
        **{
            "lib.size": ro.r("as.vector")(
                numpy2ri.py2rpy(np.array(samples["lib.size"]))
            )
        },
    )
    # apply TMM normalization
    y = ro.r("calcNormFactors")(y)
    if len(columns_a) == 1 and len(columns_b) == 1:  # pragma: no cover
        # not currently used.
        z = manual_dispersion_value
        e = ro.r("exactTest")(y, dispersion=math.pow(manual_dispersion_value, 2))
        """
        you are attempting to estimate dispersions without any replicates.
        Since this is not possible, there are several inferior workarounds to
        come up with something still semi-useful.
        1. pick a reasonable dispersion value from "Experience": 0.4 for
           humans, 0.1 for genetically identical model organisms, 0.01 for
           technical replicates. We'll try this for now.
        2. estimate dispersions on a number of genes that you KNOW to be not
           differentially expressed.
        3. In case of multiple factor experiments, discard the least important
           factors and treat the samples as replicates.
        4. just use logFC and forget about significance.
        """
    else:
        z = ro.r("estimateDisp")(y, robust=True)
        e = ro.r("exactTest")(z)
    res = ro.r("topTags")(e, n=len(input_df), **{"sort.by": "none"})
    result = mbf_r.convert_dataframe_from_r(res[0])
    return result
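# Hypothetical call of edgeR_comparison (assumes `obj` is an instance of the
# defining class and that R with edgeR is installed). With "sort.by" = "none"
# the result rows stay in the order of the input dataframe.
counts = pd.DataFrame(
    np.random.negative_binomial(5, 0.3, size=(1000, 6)),
    columns=["a1", "a2", "a3", "b1", "b2", "b3"],
)
res = obj.edgeR_comparison(
    counts,
    columns_a=["a1", "a2", "a3"],
    columns_b=["b1", "b2", "b3"],
)  # columns: logFC, logCPM, PValue, FDR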
def edgeR_comparison(
    self, df, columns_a, columns_b, library_sizes=None, manual_dispersion_value=0.4
):
    """Call edgeR glmFit/glmLRT with a paired design, comparing two groups.

    Resulting dataframe is in df order.
    """
    import mbf_r
    import rpy2.robjects as ro
    import rpy2.robjects.numpy2ri as numpy2ri

    if len(columns_a) != len(columns_b):
        raise ValueError("paired requires equal length groups")
    ro.r("library(edgeR)")
    input_df = df[columns_a + columns_b]
    input_df.columns = ["X_%i" % x for x in range(len(input_df.columns))]
    if library_sizes is not None:  # pragma: no cover
        samples = pd.DataFrame({"lib.size": library_sizes})
    else:
        samples = pd.DataFrame({"lib.size": input_df.sum(axis=0)})
    # remember, edgeR does b-a not a-b...
    samples.insert(0, "group", ["z"] * len(columns_b) + ["y"] * len(columns_a))
    samples.insert(
        1,
        "pairs",
        [str(x) for x in list(range(len(columns_a))) + list(range(len(columns_a)))],
    )
    r_counts = mbf_r.convert_dataframe_to_r(input_df)
    r_samples = mbf_r.convert_dataframe_to_r(samples)
    design = ro.r("model.matrix")(ro.r("~pairs+group"), data=r_samples)
    y = ro.r("DGEList")(
        counts=r_counts,
        samples=r_samples,
        **{
            "lib.size": ro.r("as.vector")(
                numpy2ri.py2rpy(np.array(samples["lib.size"]))
            )
        },
    )
    # apply TMM normalization
    y = ro.r("calcNormFactors")(y)
    z = ro.r("estimateDisp")(y, design, robust=True)
    fit = ro.r("glmFit")(z, design)
    lrt = ro.r("glmLRT")(fit)
    res = ro.r("topTags")(lrt, n=len(input_df), **{"sort.by": "none"})
    result = mbf_r.convert_dataframe_from_r(res[0])
    return result
def xform_brain(x: Union['core.NeuronObject', 'pd.DataFrame', 'np.ndarray'],
                source: str,
                target: str,
                fallback: Optional[Literal['AFFINE']] = None,
                **kwargs) -> Union['core.NeuronObject', 'pd.DataFrame', 'np.ndarray']:
    """Transform 3D data between template brains.

    This is just a wrapper for ``nat.templatebrains:xform_brain``.

    Parameters
    ----------
    x :         Neuron/List | numpy.ndarray | pandas.DataFrame
                Data to transform. DataFrame must contain ``['x', 'y', 'z']``
                columns. Numpy array must be of shape ``(N, 3)``.
    source :    str
                Source template brain that the data currently is in.
    target :    str
                Target template brain that the data should be transformed into.
    fallback :  None | "AFFINE"
                If "AFFINE", will fall back to affine transformation if the
                CMTK transformation fails. Else, coordinates of points for
                which the transformation failed (e.g. because they are out of
                bounds) will be returned as ``None``.
    **kwargs
                Keyword arguments passed to ``nat.templatebrains:xform_brain``.

    Returns
    -------
    same type as ``x``
                Copy of input with transformed coordinates.
    """
    if not isinstance(x, (core.TreeNeuron, np.ndarray, pd.DataFrame)):
        raise TypeError(f'Unable to transform data of type "{type(x)}"')

    if isinstance(x, core.TreeNeuron):
        x = x.copy()
        x.nodes = xform_brain(x.nodes, source, target)
        if x.has_connectors:
            x.connectors = xform_brain(x.connectors, source, target)
        return x
    elif isinstance(x, pd.DataFrame):
        if any([c not in x.columns for c in ['x', 'y', 'z']]):
            raise ValueError('DataFrame must have x, y and z columns.')
        x = x.copy()
        x.loc[:, ['x', 'y', 'z']] = xform_brain(
            x[['x', 'y', 'z']].values.astype(float), source, target
        )
        return x
    elif x.shape[1] != 3:
        raise ValueError('Array must be of shape (N, 3).')

    if isinstance(source, str):
        source = robjects.r(source)
    else:
        raise TypeError(f'Expected source of type str, got "{type(source)}"')

    if isinstance(target, str):
        target = robjects.r(target)
    else:
        raise TypeError(f'Expected target of type str, got "{type(target)}"')

    # We need to convert numpy arrays explicitly
    if isinstance(x, np.ndarray):
        x = numpy2ri.py2rpy(x)

    xf = nat_templatebrains.xform_brain(x,
                                        sample=source,
                                        reference=target,
                                        FallBackToAffine=fallback == 'AFFINE',
                                        **kwargs)

    return np.array(xf)
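# Hypothetical usage with a bare coordinate array (assumes the R packages nat,
# nat.templatebrains and nat.flybrains are installed, so that template brains
# such as FCWB and JFRC2 resolve via robjects.r(...)):
pts = np.array([[429.0, 205.0, 100.0],
                [430.5, 210.2, 102.3]])
pts_jfrc2 = xform_brain(pts, source='FCWB', target='JFRC2')  # (N, 3) array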
def CAM(XX, pns_type=None, pns_thres=None, adj_after_pns=None, pruning_type=None):
    # XX is a numpy array (n samples x d variables)
    string = '''
    asSparseMatrix <- function(d){
        return(as(matrix(0, d, d), "sparseMatrix"))
    }
    whichMax <- function(input){
        return(which.max(input))
    }
    '''
    selfpack = SignatureTranslatedAnonymousPackage(string, "selfpack")

    n, d = XX.shape
    maxNumParents = min(d - 1, round(n / 20))
    X = numpy2ri.py2rpy(XX)

    # Preliminary neighbourhood selection (PNS), if requested
    if pns_type is not None:
        if pns_thres is not None and 0 <= pns_thres <= 1:
            selMat = pns_type(X, pns_thres=pns_thres, verbose=False)
        else:
            raise ValueError("pns_thres must be a value in [0, 1]")
    else:
        if adj_after_pns is None:
            selMat = np.ones((d, d))
        else:
            selMat = adj_after_pns

    computeScoreMatTmp = robjects.r.computeScoreMat(
        X, scoreName='SEMGAM', numParents=1, numCores=1, output=False,
        selMat=numpy2ri.py2rpy(selMat),
        parsScore=ListVector({'numBasisFcts': 10}),
        intervMat=float('nan'), intervData=False)

    scoreVec = []
    edgeList = []
    pathMatrix = robjects.r.matrix(0, d, d)
    # Adj = selfpack.asSparseMatrix(d)
    Adj = robjects.r.matrix(0, d, d)
    scoreNodes = computeScoreMatTmp.rx('scoreEmptyNodes')[0]
    scoreMat = computeScoreMatTmp.rx('scoreMat')[0]
    counterUpdate = 0

    # Greedily insert the edge with the largest score gain until every
    # remaining entry of the score matrix is -Inf (i.e. forbidden).
    while sum(scoreMat.ro != -float('inf')) > 0:
        # print(sum(scoreMat.ro != -float('inf')))
        ix_max = robjects.r.arrayInd(selfpack.whichMax(scoreMat),
                                     robjects.r.dim(scoreMat))
        ix_max_backward = robjects.r.matrix(IntVector([ix_max[1], ix_max[0]]), 1, 2)
        Adj.rx[ix_max] = 1
        scoreNodes.rx[ix_max[1]] = scoreNodes.rx(ix_max[1]).ro + scoreMat.rx(ix_max)
        scoreMat.rx[ix_max] = -float('inf')
        # Track reachability to keep the graph acyclic
        pathMatrix.rx[ix_max[0], ix_max[1]] = 1
        DescOfNewChild = robjects.r.which(pathMatrix.rx(ix_max[1], True).ro == 1)
        AncOfNewChild = robjects.r.which(pathMatrix.rx(True, ix_max[0]).ro == 1)
        pathMatrix.rx[AncOfNewChild, DescOfNewChild] = 1
        scoreMat.rx[robjects.r.t(pathMatrix).ro == 1] = -float('inf')
        scoreMat.rx[ix_max[1], ix_max[0]] = -float('inf')
        scoreVec.append(sum(scoreNodes))
        edgeList.append(list(ix_max))
        scoreMat = robjects.r.updateScoreMat(
            scoreMat, X, scoreName='SEMGAM', i=ix_max[0], j=ix_max[1],
            scoreNodes=scoreNodes, Adj=Adj, numCores=1, output=False,
            maxNumParents=maxNumParents,
            parsScore=ListVector({'numBasisFcts': 10}),
            intervMat=float('nan'), intervData=False)
        counterUpdate = counterUpdate + 1

    if pruning_type is not None:
        # Adj is the output
        pass

    return np.array(Adj)
def pruning_cam(XX, Adj):
    # XX is the raw data (numpy), Adj the adjacency matrix produced by CAM.
    X2 = numpy2ri.py2rpy(XX)
    Adj = _pruning(X=X2, G=Adj,
                   pruneMethod=robjects.r.selGam,
                   pruneMethodPars=ListVector({'cutOffPVal': 0.001,
                                               'numBasisFcts': 10}),
                   output=False)
    return Adj
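# Hypothetical end-to-end use of CAM + pruning_cam. This assumes the R
# functions computeScoreMat, updateScoreMat and selGam from the CAM package
# are available in the R session (e.g. after library(CAM) or sourcing the
# corresponding R code) and that the rpy2 imports used above are in scope.
import numpy as np

data = np.random.normal(size=(200, 5))   # n samples x d variables
adj = CAM(data)                           # greedy edge additions, no PNS
adj = pruning_cam(data, adj)              # prune spurious parents with selGam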