def logCountsWithFactors(counts, size_factors):
    """Log-normalizes a matrix of counts (genes as rows) given a vector of
    size factors, using normalize() on an R SingleCellExperiment.

    NOTE(review): the original docstring said the R package "scater" is used,
    but the code imports "scran" — confirm which package provides normalize()
    in this environment.

    :param counts: a matrix of counts (genes as rows)
    :param size_factors: a vector of size factors
    :return the normalized log counts (genes as rows)
    """
    # Keep the pandas labels so they can be restored after the R round-trip
    # (the R data.frame loses/changes them).
    columns = counts.columns
    indexes = counts.index
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    scater = RimportLibrary("scran")
    r_call = """
        function(counts, size_factors){
          sce = SingleCellExperiment(assays=list(counts=as.matrix(counts)))
          sizeFactors(sce) = size_factors
          sce = normalize(sce)
          norm_counts = logcounts(sce)
          return(as.data.frame(norm_counts))
        }
    """
    r_func = r(r_call)
    r_norm_counts = r_func(r_counts, size_factors)
    pandas_norm_counts = pandas2ri.ri2py(r_norm_counts)
    # Restore the original row/column labels.
    pandas_norm_counts.index = indexes
    pandas_norm_counts.columns = columns
    pandas2ri.deactivate()
    return pandas_norm_counts
def testActivate(self):
    """activate() installs pandas2ri as the py2ri converter; deactivate()
    restores the default converter."""
    conv = robjects.conversion
    # Start from the default converter so the first assertion is meaningful.
    conv.py2ri = robjects.default_py2ri
    self.assertNotEqual(rpyp.pandas2ri, conv.py2ri)
    rpyp.activate()
    self.assertEqual(rpyp.pandas2ri, conv.py2ri)
    rpyp.deactivate()
    self.assertEqual(robjects.default_py2ri, conv.py2ri)
def deaScranDESeq2(counts, conds, comparisons, alpha, scran_clusters=False):
    """Makes a call to DESeq2 with SCRAN to perform D.E.A. in the given
    counts matrix with the given conditions and comparisons.
    Returns a list of DESeq2 results for each comparison.

    :param counts: counts matrix (genes as rows, cells as columns)
    :param conds: per-cell condition labels
    :param comparisons: iterable of (A, B) condition pairs to contrast
    :param alpha: significance cut-off passed to DESeq2::results
    :param scran_clusters: when True, pre-cluster cells before computing
        sum factors
    :return: list of pandas DataFrames, one per comparison
    """
    results = list()
    n_cells = len(counts.columns)
    pandas2ri.activate()
    # try/finally replaces the previous `except Exception as e: raise e`
    # (a no-op re-raise) and guarantees the pandas2ri converter is
    # deactivated even when an R call fails.
    try:
        deseq2 = RimportLibrary("DESeq2")
        scran = RimportLibrary("scran")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
        as_matrix = r["as.matrix"]
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.StrVector(conds)
        r_call = """
            function(r_counts) {
              sce = SingleCellExperiment(assays=list(counts=r_counts))
              return(sce)
            }
        """
        r_func = r(r_call)
        sce = r_func(as_matrix(r_counts))
        if scran_clusters:
            r_clusters = scran.quickCluster(as_matrix(r_counts), max(n_cells/10, 10))
            min_cluster_size = min(Counter(r_clusters).values())
            # Pool sizes derived from the smallest cluster so every cluster
            # can supply each pool.
            sizes = list(set([round((min_cluster_size/2) / i) for i in [5,4,3,2,1]]))
            sce = scran.computeSumFactors(sce, clusters=r_clusters,
                                          sizes=sizes, positive=True)
        else:
            sizes = list(set([round((n_cells/2) * i) for i in [0.1,0.2,0.3,0.4,0.5]]))
            sce = scran.computeSumFactors(sce, sizes=sizes, positive=True)
        sce = r.normalize(sce)
        dds = r.convertTo(sce, type="DESeq2")
        r_call = """
            function(dds, conditions){
              colData(dds)$conditions = as.factor(conditions)
              design(dds) = formula(~ conditions)
              return(dds)
            }
        """
        r_func = r(r_call)
        dds = r_func(dds, cond)
        dds = r.DESeq(dds)
        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B), alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
    finally:
        pandas2ri.deactivate()
    return results
def fit(self, dfx: pd.DataFrame, outcome_col, covariate_cols, teacher_id_col,
        **argv):
    """Fit a fixed-effects linear model via R's lfe::felm.

    Stores the estimated fixed effects in ``self.effect`` and two residual
    series (with and without the fixed-effect component) aligned to the
    original index of *dfx*.

    :param dfx: input data frame
    :param outcome_col: name of the dependent-variable column
    :param covariate_cols: regressor column names (may overlap fixed effects)
    :param teacher_id_col: column identifying the teacher fixed effect
    :param argv: unused extra keyword arguments (accepted for interface
        compatibility)
    """
    # Covariates that are themselves fixed effects must not also enter the
    # formula as regular regressors.
    covariate_cols_except_fixed = [
        x for x in covariate_cols if x not in self.fixed_effect_cols
    ]
    fixed_effect_cols_plus_tid = [teacher_id_col] + self.fixed_effect_cols
    dropna_subset_cols = [outcome_col
                          ] + covariate_cols + fixed_effect_cols_plus_tid
    formula = create_felm_formula(outcome_col, covariate_cols_except_fixed,
                                  fixed_effect_cols_plus_tid,
                                  self.factor_cols)
    pandas2ri.activate()
    # Rows with missing values in any used column are dropped before fitting.
    df_use = dfx.dropna(subset=dropna_subset_cols)
    _res1 = self.r.assign("r_df", pandas2ri.py2rpy(df_use))
    _res2 = self.r(
        "res <- lfe::felm({formula}, r_df)".format(formula=formula))
    bb = self.r("lfe::getfe(res)")
    self.effect = bb
    # Residuals are re-aligned to the *original* index; rows removed by
    # dropna() remain NaN.
    self.residuals_without_fixed = pd.Series(index=dfx.index)
    self.residuals_without_fixed.loc[df_use.index, ] = self.r(
        "res$r.residuals")[:, 0]
    self.residuals_with_fixed = pd.Series(index=dfx.index)
    self.residuals_with_fixed.loc[df_use.index, ] = self.r(
        "res$residuals")[:, 0]
    pandas2ri.deactivate()
def computeMnnBatchCorrection(counts):
    """Computes batch correction to a list of batches (data frames) where
    each data frame represents a batch (animal for instance).
    The batch correction is computed using Scran::mnnCorrect()
    from Marioni et al.
    :param counts: a list of matrices of counts
    :return returns a list of batch corrected matrices of counts
    """
    pandas2ri.activate()
    as_matrix = r["as.matrix"]
    # Remember each batch's labels; the R round-trip discards them.
    labels = [(frame.index, frame.columns) for frame in counts]
    r_matrices = [as_matrix(pandas2ri.py2ri(frame)) for frame in counts]
    RimportLibrary("scran")
    r_call = """
        function(counts) {
          norm_counts = do.call(mnnCorrect, c(counts, cos.norm.out=FALSE));
          return(lapply(norm_counts$corrected, as.data.frame))
        }
    """
    mnn_correct = r(r_call)
    corrected = list()
    for r_frame, (row_labels, col_labels) in zip(mnn_correct(r_matrices), labels):
        frame = pandas2ri.ri2py(r_frame)
        frame.index = row_labels
        frame.columns = col_labels
        corrected.append(frame)
    pandas2ri.deactivate()
    return corrected
def slingshot(adata, start, n_pcs=5, cl=None):
    """Run R slingshot pseudotime inference on an AnnData object.

    Builds a SlingshotDataSet from the first *n_pcs* PCA components and the
    cluster labels in ``adata.obs[cl]``, infers lineages starting from
    cluster *start*, writes per-lineage pseudotime columns into
    ``adata.obs`` and the lineage cluster paths into
    ``adata.uns['slingshot']['lineages']``.

    :param adata: AnnData with ``obsm['X_pca']`` already computed
    :param start: name of the starting cluster (passed as start.clus)
    :param n_pcs: number of PCA components to use
    :param cl: key in ``adata.obs`` holding the cluster labels
    :return: the modified AnnData object
    """
    import numpy as np
    import pandas as pd
    import rpy2.robjects as ro
    from rpy2.robjects import numpy2ri, pandas2ri
    from rpy2.robjects.packages import importr
    importr('slingshot')
    numpy2ri.activate()
    pandas2ri.activate()
    # Push the PCA coordinates and cluster labels into the R global env.
    ro.r.assign('pca', adata.obsm['X_pca'][:, :n_pcs])
    ro.r.assign('cl', adata.obs[cl])
    ro.reval('sds <- newSlingshotDataSet(pca, cl)')
    ro.reval(f'sce <- slingshot(sds, cl, start.clus="{start}")')
    # One pseudotime column per inferred lineage, indexed by cell.
    pt = pd.DataFrame(np.asarray(ro.reval('slingPseudotime(sce)')),
                      index=adata.obs_names)
    pt.columns = [f'{cl}_lineage_{c}' for c in pt.columns]
    # Drop stale pseudotime columns from a previous run, if present.
    try:
        adata.obs = adata.obs.drop(pt.columns, axis=1)
    except KeyError:
        print('PT keys not dropped in obs dataframe: Not found.')
    adata.obs = pd.concat([adata.obs, pt], axis=1)
    adata.uns['slingshot'] = {}
    adata.uns['slingshot']['lineages'] = {}
    lineages = np.asarray(np.asarray(ro.reval('sce@lineages')))
    for i, l in enumerate(lineages):
        adata.uns['slingshot']['lineages'][i] = list(np.asarray(l))
    numpy2ri.deactivate()
    pandas2ri.deactivate()
    return adata
def Rtsne(counts, dimensions, theta=0.5, dims=50, perplexity=30, max_iter=1000):
    """Performs dimensionality reduction using the R package Rtsne.

    :param counts: pandas DataFrame of input data
    :param dimensions: target dimensionality (Rtsne `dims`)
    :param theta: Barnes-Hut trade-off parameter
    :param dims: number of initial PCA dimensions (Rtsne `initial_dims`)
    :param perplexity: t-SNE perplexity
    :param max_iter: maximum number of iterations
    :return: the embedded coordinates (Rtsne `Y`) as converted by pandas2ri
    """
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    tsne = RimportLibrary("Rtsne")
    multicore = RimportLibrary("BiocParallel")
    multicore.register(
        multicore.MulticoreParam(multiprocessing.cpu_count() - 1))
    as_matrix = r["as.matrix"]
    # BUG FIX: pass the explicitly converted r_counts — it was previously
    # created and then never used, with the raw pandas object handed to R
    # via the implicit (activated) converter instead.
    tsne_out = tsne.Rtsne(as_matrix(r_counts), dims=dimensions, theta=theta,
                          check_duplicates=False, pca=True, initial_dims=dims,
                          perplexity=perplexity, max_iter=max_iter,
                          verbose=False)
    pandas_tsne_out = pandas2ri.ri2py(tsne_out.rx2('Y'))
    pandas2ri.deactivate()
    return pandas_tsne_out
def oneWay_rmAnova(DV, ID, IV):
    '''One-way repeated-measures ANOVA via R afex, followed by
    Holm-corrected pairwise contrasts via emmeans.

    Parameters
    ----------
    DV : list/array
        Dependent variable as a singular list/array (will make a df-longside)
    ID : list/array
        Repeated measure: list/array of assigned identities
    IV : list/array
        The independent variable (condition) you are testing across as a
        list/array

    Returns
    -------
    The R summary of the Holm-adjusted pairwise contrasts.
    '''
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    # Long-format dataframe: one row per (ID, condition) observation.
    df = pd.DataFrame({'DV': DV, 'ID': ID, 'IV': IV})
    r_df = pandas2ri.py2ri(df)
    afex = importr('afex')
    model = afex.aov_ez('ID', 'DV', r_df, within='IV')
    print(R.summary(model))
    # esm = importr("emmeans", on_conflict="warn")
    esm = importr("emmeans")
    # Pairwise follow-ups across IV levels, Holm-adjusted.
    pairwise = esm.lsmeans(model, "IV", contr="pairwise", adjust="holm")
    print(R.summary(pairwise))
    pandas2ri.deactivate()
    return R.summary(pairwise)
def testSeries(self):
    """py2ri converts a pandas Series of floats to an R float vector."""
    make_series = pandas.core.series.Series
    src = make_series(numpy.random.randn(5),
                      index=['a', 'b', 'c', 'd', 'e'])
    rpyp.activate()
    converted = robjects.conversion.py2ri(src)
    rpyp.deactivate()
    self.assertEqual(rinterface.FloatSexpVector, type(converted))
def apply_transferFunction_metric(r_stream1, r_stream2, evalresp1, evalresp2):
    """"
    Invoke the IRISMustangMetrics transferFunctionMetric R function and
    convert the R dataframe result into a Pandas dataframe.
    :param r_stream1: an r_stream object
    :param r_stream2: an r_stream object
    :param evalresp1: pandas DataFrame of evalresp FAP for r_stream1
    :param evalresp2: pandas DataFrame of evalresp FAP for r_stream2
    :return: pandas DataFrame of metric values with UTCDateTime start/end
    """
    R_function = robjects.r('IRISMustangMetrics::transferFunctionMetric')

    # NOTE: Conversion of dataframes only works if you activate but we don't want conversion
    # NOTE: to always be automatic so we deactivate() after we're done converting.
    pandas2ri.activate()
    r_evalresp1 = pandas2ri.py2ri_pandasdataframe(evalresp1)
    r_evalresp2 = pandas2ri.py2ri_pandasdataframe(evalresp2)
    pandas2ri.deactivate()

    # TODO: Can we just activate/deactivate before/after R_function() without converting
    # TODO: r_evalresp1/2 ahead of time?

    # Calculate the metric
    r_metriclist = R_function(r_stream1, r_stream2, r_evalresp1, r_evalresp2)
    r_dataframe = _R_metricList2DF(r_metriclist)
    pandas2ri.activate()
    df = pandas2ri.ri2py_dataframe(r_dataframe)
    pandas2ri.deactivate()

    # Convert columns from R POSIXct to pyton UTCDateTime
    df.starttime = df.starttime.apply(UTCDateTime)
    df.endtime = df.endtime.apply(UTCDateTime)
    return df
def computeSumFactors(counts, scran_clusters=True):
    """ Compute normalization factors
    using the deconvolution method
    described in Merioni et al.
    Returns the computed size factors as a vector.
    :param counts: a matrix of counts (genes as rows)
    :param scran_clusters: when True, pre-cluster cells with quickCluster
        and deconvolve within clusters
    :return returns the normalization factors a vector
    """
    n_cells = len(counts.columns)
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    scran = RimportLibrary("scran")
    multicore = RimportLibrary("BiocParallel")
    # Use all cores but one for the BiocParallel backend.
    multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
    as_matrix = r["as.matrix"]
    if scran_clusters:
        r_clusters = scran.quickCluster(as_matrix(r_counts), max(n_cells/10, 10))
        min_cluster_size = min(Counter(r_clusters).values())
        # Pool sizes derived from the smallest cluster so each cluster can
        # supply every pool; set() removes duplicate rounded sizes.
        sizes = list(set([round((min_cluster_size/2) / i) for i in [5,4,3,2,1]]))
        dds = scran.computeSumFactors(as_matrix(r_counts), clusters=r_clusters, sizes=sizes, positive=True)
    else:
        sizes = list(set([round((n_cells/2) * i) for i in [0.1,0.2,0.3,0.4,0.5]]))
        dds = scran.computeSumFactors(as_matrix(r_counts), sizes=sizes, positive=True)
    pandas_sf = pandas2ri.ri2py(dds)
    pandas2ri.deactivate()
    return pandas_sf
def compute_pairwise_information(cls, data, method, kwargs=None):
    """Compute a pairwise information matrix over the rows of *data*.

    Currently only mutual information (via R minet::build.mim) is
    supported; any other *method* raises ValueError.
    """
    logger = get_logger(__name__)
    # Guard clause: reject unknown methods up front.
    if method != 'mutual-information':
        raise ValueError(
            'Unsupported information method: {!r}'.format(method))
    minet = importr('minet')
    pandas2ri.activate()
    if kwargs is None:
        kwargs = {}
    # Pop the named options so the remainder passes straight through.
    estimator = kwargs.pop('estimator', 'mi.shrink')
    disc = kwargs.pop('disc', 'equalwidth')
    nbins = kwargs.pop('nbins', np.sqrt(len(data.columns)))
    logger.debug('Running minet.build_mim(estimator={!r}, '
                 'disc={!r}, nbins={!r})'.format(
                     estimator, disc, nbins))
    r_info = minet.build_mim(data.T, estimator=estimator, disc=disc,
                             nbins=nbins, **kwargs)
    info = np.asarray(r_info)
    # Free the R-side objects before deactivating the converter.
    del r_info, minet
    gc.collect()
    pandas2ri.deactivate()
    return pd.DataFrame(info, index=data.index, columns=data.index)
def generate_solutions_tables(self):
    '''
    code from Adam
    use rpy2 to execute rcode which reads out a solutions file to pandas

    Loads each sample's ABSOLUTE .RData solutions file, extracts the mode
    table via an embedded R snippet, and stores it as a DataFrame in
    self.pp_modes_tables keyed by pair_id.
    '''
    col_names = [
        'alpha', 'tau', 'AT', 'b', 'delta', 'LL', 'mode_curv',
        'genome mass', 'sigma.h.hat', 'theta.z.hat', 'sigma.A.hat',
        'theta.Q.hat', 'lambda.hat', 'theta.0', 'frac.het', 'SCNA_LL',
        'entropy', 'Kar_LL', 'WGD', 'combined_LL', 'SSNV_LL',
        'SCNA_Theta_integral', 'dens'
    ]
    # Build R function to be used as a python package
    load_RData_func_str = """
    load_RData <- function(file_path) {
        load(file_path)
        head_name <- ls()[1]
        file_name <- names(`segobj.list`)[1]
        r_data <- `segobj.list`[[file_name]]$mode.res$mode.tab
        return(r_data)
    }
    """
    # Pack the function above as a package
    r_pack = SignatureTranslatedAnonymousPackage(load_RData_func_str, "r_pack")
    # BUG FIX: the Python 2 `print` statements were syntax errors under
    # Python 3; converted to print() calls.
    print('Generating absolute tables for ' + str(len(
        self.data_table)) + ' samples')
    pandas2ri.activate()
    for index, row in self.data_table.iterrows():
        if np.mod(index, 100) == 0:
            # Progress update every 100 samples.
            print(str(index) + '/' + str(len(self.data_table)))
        r_data = r_pack.load_RData(row['absolute_summary_data'])
        abs_table = pd.DataFrame(pandas2ri.ri2py(r_data),
                                 columns=col_names)
        self.pp_modes_tables[row['pair_id']] = abs_table
    pandas2ri.deactivate()
def computeSumFactors(counts, scran_clusters=True):
    """ Compute normalization factors
    using the deconvolution method
    described in Marioni et al.
    Returns the computed size factors as a vector.
    :param counts: a matrix of counts (genes as rows)
    :param scran_clusters: when True (and there are at least 50 cells),
        pre-cluster cells with quickCluster and deconvolve within clusters
    :return returns the normalization factors a vector
    """
    n_cells = len(counts.columns)
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    scran = RimportLibrary("scran")
    as_matrix = r["as.matrix"]
    # Clustering is only attempted with enough cells (>= 50).
    if scran_clusters and n_cells >= 50:
        r_clusters = scran.quickCluster(as_matrix(r_counts),
                                        min(n_cells/10, 10), method="igraph")
        min_cluster_size = min(Counter(r_clusters).values())
        # Pool sizes bounded by the smallest cluster, stepping by 5.
        sizes = list(range(min(int(min_cluster_size/4), 10),
                           min(int(min_cluster_size/2), 50), 5))
        dds = scran.computeSumFactors(as_matrix(r_counts),
                                      clusters=r_clusters, sizes=sizes)
    else:
        sizes = list(range(min(int(n_cells/4), 10), min(int(n_cells/2), 50), 5))
        dds = scran.computeSumFactors(as_matrix(r_counts), sizes=sizes)
    pandas_sf = pandas2ri.ri2py(dds)
    pandas2ri.deactivate()
    return pandas_sf
def apply_PSD_plot(r_stream, filepath, evalresp=None):
    """"
    Create a PSD plot which will be written to a .png file.
    :param r_stream: an r_stream object
    :param filepath: file path for png output
    :param evalresp: (optional) pandas dataframe of FAP from evalresp
        (freq,amp,phase)
    :return: True on completion
    """
    robjects.r('grDevices::png')(filepath)
    r_psdList = robjects.r('IRISSeismic::psdList')(r_stream)
    if len(r_psdList) == 0:
        raise Exception("No PSDs returned")
    psd_plot = robjects.r('IRISSeismic::psdPlot')
    # Activate only around the dataframe conversion; we don't want the
    # converter permanently installed.
    pandas2ri.activate()
    if evalresp is None:
        psd_plot(r_psdList, style='pdf')
    else:
        r_evalresp = pandas2ri.py2ri(evalresp)  # convert to R dataframe
        psd_plot(r_psdList, style='pdf', evalresp=r_evalresp)
    pandas2ri.deactivate()
    robjects.r('grDevices::dev.off')()
    return True
def convert_rdata_to_dataframe(filename):
    """Transform an RData object into a dict of pandas DataFrames.

    SMALL HELPER FUNCTION — CURRENTLY THERE IS NO VALUE ERROR CHECKING.

    :param filename: path to the .RData file
    :return: dict mapping position -> pandas.DataFrame
    """
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.conversion import localconverter
    import rpy2.robjects as ro
    # print ( 'WARNING THIS PROGRAM NEED VALUE ERROR CHECKING' )
    # First read: converter not yet active, so R[...] yields rpy2 objects
    # whose colnames/rownames we can harvest.
    rd_ = R.load(filename)
    if 'matrix' in str(type(R[rd_[0]])).lower():
        column_names = [R[rd_[0]].colnames]
        index_names = [R[rd_[0]].rownames]
    else:
        column_names = [[c for c in item.colnames] for item in R[rd_[0]]]
        index_names = [[i for i in item.rownames] for item in R[rd_[0]]]
    pandas2ri.activate()
    # Second read happens *after* activate(): the active converter changes
    # what R[...] returns (numpy arrays instead of rpy2 vectors), so the
    # object must be re-loaded — do not merge with the read above.
    rd = R.load(filename)
    if 'ndarray' in str(type(R[rd[0]])).lower():
        raw_frames = [R[rd[0]]]
    else:
        raw_frames = [rdf for rdf in ro.vectors.DataFrame(R[rd[0]])]
    # Previous version built these lists with side-effecting list
    # comprehensions and a manual counter; replaced with plain iteration.
    full_df_dict = {}
    for i, (raw_df, colnames, rownames) in enumerate(
            zip(raw_frames, column_names, index_names)):
        full_df_dict[i] = pd.DataFrame(raw_df, columns=colnames,
                                       index=rownames)
    pandas2ri.deactivate()
    return full_df_dict
def apply_transferFunction_metric(r_stream1, r_stream2, evalresp1, evalresp2):
    """"
    Invoke the IRISMustangMetrics transferFunctionMetric R function and
    convert the R dataframe result into a Pandas dataframe.
    :param r_stream1: an r_stream object
    :param r_stream2: an r_stream object
    :param evalresp1: pandas DataFrame of evalresp FAP for r_stream1
    :param evalresp2: pandas DataFrame of evalresp FAP for r_stream2
    :return: pandas DataFrame of metric values with UTCDateTime start/end
    """
    R_function = robjects.r('IRISMustangMetrics::transferFunctionMetric')

    # NOTE: Conversion of dataframes only works if you activate but we don't want conversion
    # NOTE: to always be automatic so we deactivate() after we're done converting.
    pandas2ri.activate()
    r_evalresp1 = pandas2ri.py2ri_pandasdataframe(evalresp1)
    r_evalresp2 = pandas2ri.py2ri_pandasdataframe(evalresp2)
    pandas2ri.deactivate()

    # TODO: Can we just activate/deactivate before/after R_function() without converting
    # TODO: r_evalresp1/2 ahead of time?

    # Calculate the metric
    r_metriclist = R_function(r_stream1, r_stream2, r_evalresp1, r_evalresp2)
    r_dataframe = _R_metricList2DF(r_metriclist)
    pandas2ri.activate()
    df = pandas2ri.ri2py_dataframe(r_dataframe)
    pandas2ri.deactivate()

    # Convert columns from R POSIXct to pyton UTCDateTime
    df.starttime = df.starttime.apply(UTCDateTime)
    df.endtime = df.endtime.apply(UTCDateTime)
    return df
def fit_and_predict(self, train, horizon):
    """Fit a TBATS model in R (package `forecast`) and forecast ahead.

    Single seasonal frequencies use ts(); multiple frequencies use msts().
    Returns (fitted, forecast) as converted data frames.
    """
    r_string = """
        function(data, frequency, horizon){
            library(forecast)
            if(length(frequency) == 1){
                ts_data <- ts(data, frequency=frequency)
            }else{
                ts_data <- msts(data, seasonal.periods=frequency)
            }
            fit <- tbats(ts_data)
            fitted_df <- data.frame(fit$fitted.values)
            forecast <- forecast(fit, h = horizon)
            forecast_df <- data.frame(forecast)
            output <- list(fitted_df, forecast_df)
            return(output)
        }
    """
    fit_forecast = robjects.r(r_string)
    pandas2ri.activate()
    r_output = fit_forecast(train, robjects.IntVector(self.frequency),
                            horizon)
    fitted = pandas2ri.ri2py(r_output[0])
    forecast = pandas2ri.ri2py(r_output[1])
    pandas2ri.deactivate()
    return fitted, forecast
def fit_and_predict(self, train, horizon):
    """Fit a Holt-Winters model in R (package `forecast`) and forecast
    *horizon* steps ahead. Returns (fitted, forecast) data frames."""
    r_string = """
        function(data, frequency, horizon){
            library(forecast)
            ts_data <- ts(data, frequency=frequency)
            fit <- HoltWinters(ts_data)
            fitted_df <- data.frame(fit$fitted)
            forecast <- forecast(fit, h = horizon)
            forecast_df <- data.frame(forecast)
            output <- list(fitted_df, forecast_df)
            return(output)
        }
    """
    holt_winters = robjects.r(r_string)
    # Run R
    pandas2ri.activate()
    r_output = holt_winters(train, self.frequency, horizon)
    fitted = pandas2ri.ri2py(r_output[0])
    forecast = pandas2ri.ri2py(r_output[1])
    pandas2ri.deactivate()
    return fitted, forecast
def r(code=None, path=None, rel=True, conda=True, convert=True, repo='https://cran.microsoft.com/', **kwargs): ''' Runs the R script and returns the result. :arg str code: R code to execute. :arg str path: R script path. Cannot be used if code is specified :arg bool rel: True treats path as relative to the caller function's file :arg bool conda: True overrides R_HOME to use the Conda R :arg bool convert: True converts R objects to Pandas and vice versa :arg str repo: CRAN repo URL All other keyword arguments as passed as parameters ''' # Use Conda R if possible if conda: r_home = _conda_r_home() if r_home: os.environ['R_HOME'] = r_home # Import the global R session try: from rpy2.robjects import r, pandas2ri, globalenv except ImportError: app_log.error('rpy2 not installed. Run "conda install rpy2"') raise except RuntimeError: app_log.error('Cannot find R. Set R_HOME env variable') raise # Set a repo so that install.packages() need not ask for one r('local({r <- getOption("repos"); r["CRAN"] <- "%s"; options(repos = r)})' % repo) # Activate or de-activate automatic conversion # https://pandas.pydata.org/pandas-docs/version/0.22.0/r_interface.html if convert: pandas2ri.activate() else: pandas2ri.deactivate() # Pass all other kwargs as global environment variables for key, val in kwargs.items(): globalenv[key] = val if code and path: raise RuntimeError('Use r(code=) or r(path=...), not both') if path: # if rel=True, load path relative to parent directory if rel: stack = inspect.getouterframes(inspect.currentframe(), 2) folder = os.path.dirname(os.path.abspath(stack[1][1])) path = os.path.join(folder, path) result = r.source(path, chdir=True) # source() returns a withVisible: $value and $visible. Use only the first result = result[0] else: result = r(code) return result
def full_converter() -> conversion.Converter:
    """Build the anndata converter layered on the pandas2ri rules.

    Temporarily activates pandas2ri so the current converter (used as the
    template) includes the pandas rules, then overlays our own converter.
    """
    pandas2ri.activate()
    result = conversion.Converter("anndata conversion",
                                  template=conversion.converter)
    pandas2ri.deactivate()
    overlay_converter(converter, result)
    return result
def fit(
    self,
    x: Optional[np.ndarray] = None,
    y: Optional[np.ndarray] = None,
    w: Optional[np.ndarray] = None,
    **kwargs,
) -> "GamMGCVModel":
    """
    Fit the model.

    Params
    ------
    x
        Independent variables.
    y
        Dependent variables.
    w
        Weights of :paramref:`x`.
    kwargs
        Keyword arguments.

    Returns
    -------
    :class:`cellrank.ul.models.GamMGCVModel`
        Return fitted self.
    """
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri, Formula
    from rpy2.robjects.packages import importr

    super().fit(x, y, w, **kwargs)

    # Only fit on observations with positive weight.
    use_ixs = np.where(self.w > 0)[0]
    self._x = self.x[use_ixs]
    self._y = self.y[use_ixs]
    self._w = self.w[use_ixs]

    n_splines = kwargs.pop("n_splines", self._n_splines)

    mgcv = importr("mgcv")
    pandas2ri.activate()

    df = pandas2ri.py2rpy(
        pd.DataFrame(np.c_[self.x, self.y][use_ixs, :], columns=["x", "y"]))
    # Cubic regression spline basis with k = n_splines knots.
    self._model = mgcv.gam(
        Formula(f'y ~ s(x, k={n_splines}, bs="cr")'),
        data=df,
        sp=self._sp,
        family=robjects.r.gaussian,
        weights=pd.Series(self.w[use_ixs]),
    )
    pandas2ri.deactivate()

    return self
def testActivate(self):
    """activate() grows the py2ri registry; deactivate() restores it."""
    #FIXME: is the following still making sense ?
    self.assertNotEqual(rpyp.py2ri, robjects.conversion.py2ri)
    registry = robjects.conversion.py2ri.registry
    size_before = len(registry)
    keys_before = set(registry.keys())
    rpyp.activate()
    self.assertTrue(len(conversion.py2ri.registry) > size_before)
    rpyp.deactivate()
    self.assertEqual(size_before, len(conversion.py2ri.registry))
    self.assertEqual(keys_before, set(conversion.py2ri.registry.keys()))
def testRi2pandas(self):
    """An R data.frame converts to a pandas DataFrame with the expected
    column names and dtypes (AsIs strings and plain strings both map to
    object columns)."""
    rdataf = robjects.r('data.frame(a=1:2, b=I(c("a", "b")), c=c("a", "b"))')
    rpyp.activate()
    pandas_df = robjects.conversion.ri2py(rdataf)
    rpyp.deactivate()
    self.assertIsInstance(pandas_df, pandas.DataFrame)
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual.
    self.assertEqual(('a', 'b', 'c'), tuple(pandas_df.keys()))
    self.assertEqual(pandas_df['a'].dtype, numpy.dtype('int32'))
    self.assertEqual(pandas_df['b'].dtype, numpy.dtype('O'))
    self.assertEqual(pandas_df['c'].dtype, numpy.dtype('O'))
def testSeries_issue264(self):
    """Regression test for issue #264: repr() of a converted string Series
    with an Int64Index used to segfault."""
    make_series = pandas.core.series.Series
    src = make_series(('a', 'b', 'c', 'd', 'e'),
                      index=pandas.Int64Index([0, 1, 2, 3, 4]))
    rpyp.activate()
    converted = robjects.conversion.py2ri(src)
    rpyp.deactivate()
    # segfault before the fix
    str(converted)
    self.assertEqual(rinterface.ListSexpVector, type(converted))
def convertRtoPandas(file_path):
    """Load an RData solutions file and return its mode table as a
    pandas DataFrame with the module-level col_names columns."""
    # Pack the module-level R loader snippet as a callable package.
    loader = SignatureTranslatedAnonymousPackage(load_RData_func_str, "r_pack")
    pandas2ri.activate()
    r_data = loader.load_RData(file_path)
    frame = pd.DataFrame(pandas2ri.ri2py(r_data), columns=col_names)
    pandas2ri.deactivate()
    return frame
def full_converter() -> conversion.Converter:
    """Build the anndata converter, layering scipy2ri rules beneath our own.

    pandas2ri is activated only while the template snapshot is taken so the
    resulting converter includes the pandas rules.
    """
    pandas2ri.activate()
    result = conversion.Converter("anndata conversion",
                                  template=conversion.converter)
    pandas2ri.deactivate()
    # overwrite the scipy2ri Sexp4 converter and add our others
    overlay_converter(scipy2ri.converter, result)
    overlay_converter(converter, result)
    return result
def rdf_to_pydf(x):
    """Convert an R dataframe to a python dataframe.

    The converter is activated and then deactivated around the single
    conversion: there have been some reports of inconsistencies when the
    converter stays active from import time.
    """
    pandas2ri.activate()
    converted = pandas2ri.ri2py(x)
    pandas2ri.deactivate()
    return converted
def testActivate(self):
    """activate() grows the py2rpy registry; deactivate() restores it."""
    #FIXME: is the following still making sense ?
    assert rpyp.py2rpy != robjects.conversion.py2rpy
    registry = robjects.conversion.py2rpy.registry
    size_before = len(registry)
    keys_before = set(registry.keys())
    rpyp.activate()
    assert len(conversion.py2rpy.registry) > size_before
    rpyp.deactivate()
    assert len(conversion.py2rpy.registry) == size_before
    assert set(conversion.py2rpy.registry.keys()) == keys_before
def fit(self, X, y, **kwargs):
    """Fit an R mgcv GAM on (X, y).

    Categorical columns are one-hot encoded, feature names are sanitized
    for R, a smooth term is built per feature (binary features enter
    linearly), and the model is fitted via the rpy2 bridge into
    ``self.R_model``.
    """
    # Do one-hot encoding
    self.cat_columns = X.columns[X.dtypes == object].values.tolist()
    X = pd.get_dummies(X)

    import rpy2.robjects as ro
    from rpy2.robjects import pandas2ri, Formula

    assert not self.is_fit(), 'Call fit() twice!'
    if self.clean_feature_names is None:
        # R can not accept wierd symbols as names
        self.clean_feature_names = []
        for name in self.feature_names:
            the_name = name.replace('-', '_').replace(' ', '_').replace('$', '').replace('/', '_')\
                .replace('>', '_big_').replace('(', '_lq_').replace(')', '_rq_').replace('?', '_ques_')\
                .replace('.', '_dot_').replace('&', '_and_')
            # R identifiers must not start with an underscore.
            if the_name.startswith('_'):
                the_name = 's_%s' % (the_name[1:])
            self.clean_feature_names.append(the_name)

    # Create the fitting string e.g. 'y~s(age)+s(BUN_level)+gender'
    formula_terms = []
    for feat_name, clean_feat_name in zip(self.feature_names, self.clean_feature_names):
        num_unique_x = len(self.X_values_counts[feat_name])
        # Constant features carry no information; skip them.
        if num_unique_x < 2:
            continue
        # Binary features enter linearly; others get a cubic regression
        # spline with k capped by maxk and ~2/3 of the unique values.
        term_str = "%s" % clean_feat_name if num_unique_x == 2 \
            else "s(%s, bs='cr', k=%d)" % (clean_feat_name, min(self.maxk, int(num_unique_x*2/3)))
        formula_terms.append(term_str)

    formula_str = 'y~%s' % ('+'.join(formula_terms))
    print('formula_str:', formula_str)
    formula = Formula(formula_str)
    pandas2ri.activate()
    # Bind the response and each (renamed) regressor into the formula's
    # R environment so the model can resolve them by name.
    env = formula.environment
    env['y'] = y
    for feat_name, clean_feat_name in zip(self.feature_names, self.clean_feature_names):
        env[clean_feat_name] = X[feat_name]

    # with Timer('Fitting the R mgcv model'):
    self.R_model = ro.r[self.model_to_use](formula, family=self.family,
                                           nthreads=self.nthreads,
                                           discrete=self.discrete,
                                           select=self.select)
    pandas2ri.deactivate()
def computeNClusters(counts, min_size=20):
    """Number of clusters found in the data by Scran::quickCluster
    (igraph method, spots as rows after transposing)."""
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts.transpose())
    scran = RimportLibrary("scran")
    to_matrix = r["as.matrix"]
    labels = scran.quickCluster(to_matrix(r_counts), min_size,
                                method="igraph")
    num_clusters = len(set(labels))
    pandas2ri.deactivate()
    return num_clusters
def _save_data_as_rdata(file_name, data, cvindices):
    """Save a processed dataset plus cross-validation indices to an
    .RData file via the rpy2 helper functions.

    Always writes X_test/Y_test/sample_weights_test variables: real values
    when the data contains a test set, empty placeholder matrices otherwise.

    :param file_name: output .RData path
    :param data: dataset dict (validated by check_data)
    :param cvindices: cross-validation fold indices
    :return: True on completion
    """
    import rpy2.robjects as rn
    from .rpy2_helper import r_assign, r_save_to_disk
    from rpy2.robjects import pandas2ri
    data = set_defaults_for_data(data)
    assert check_data(data)

    fields_to_save = [
        "format", "Y", "sample_weights", "outcome_name", "variable_names"
    ]
    try:
        for k in fields_to_save:
            r_assign(data[k], k)
    except:
        # NOTE(review): bare except that drops into an interactive debug
        # shell instead of propagating — leftover debugging aid; errors
        # are swallowed here.
        from eqm.debug import ipsh
        ipsh()

    r_assign(cvindices, "cvindices")

    pandas2ri.activate()
    # Assign X as a named DataFrame so R sees the column names.
    X_df = pd.DataFrame(data=data['X'])
    X_df.columns = data['variable_names']
    rn.r.assign('X', X_df)

    # test set
    has_test_set = ('X_test' in data) and ('Y_test' in data) and ('sample_weights_test' in data)
    if has_test_set:
        X_test_df = pd.DataFrame(data=data['X_test'])
        X_test_df.columns = data['variable_names']
        rn.r.assign('X_test', pandas2ri.py2ri(X_test_df))
        r_assign(data['Y_test'], 'Y_test')
        r_assign(data['sample_weights_test'], 'sample_weights_test')
    else:
        # Empty placeholders so downstream R code always finds the variables.
        rn.reval("""
        X_test = matrix(data=NA, nrow = 0, ncol = ncol(X));
        Y_test = matrix(data=NA, nrow = 0, ncol = 1);
        sample_weights_test = matrix(data=1.0, nrow = 0, ncol = 1);
        """)
    pandas2ri.deactivate()
    variables_to_save = fields_to_save + [
        "cvindices", "X", "X_test", "Y_test", "sample_weights_test"
    ]
    r_save_to_disk(file_name, variables_to_save)
    return True
def testRi2pandas_issue207(self):
    """Regression test for issue #207: assigning an R DataFrame into
    globalenv while the converter is active must not raise ValueError."""
    frame = robjects.DataFrame({'x': 1})
    rpyp.activate()
    try:
        ok = True
        robjects.globalenv['d'] = frame
    except ValueError:
        ok = False
    finally:
        # Always restore the converter and clean up the global env.
        rpyp.deactivate()
        if 'd' in robjects.globalenv:
            del(robjects.globalenv['d'])
    self.assertTrue(ok)
def computeNClusters(counts, min_size=20):
    """Number of clusters found in the data by Scran::quickCluster,
    with a parallel BiocParallel backend registered."""
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts.transpose())
    scran = RimportLibrary("scran")
    multicore = RimportLibrary("BiocParallel")
    # All cores but one.
    workers = multiprocessing.cpu_count() - 1
    multicore.register(multicore.MulticoreParam(workers))
    to_matrix = r["as.matrix"]
    labels = scran.quickCluster(to_matrix(r_counts), min_size)
    num_clusters = len(set(labels))
    pandas2ri.deactivate()
    return num_clusters
def opt_imp(data, methods, n_iter=10, freq=1440, measures=None):
    """Evaluate imputation methods by repeatedly adding synthetic outages
    and scoring each method's reconstruction.

    :param data: input time series (pandas)
    :param methods: list of dicts with keys 'name', 'alg' (R callable)
        and 'opt' (dict-of-lists of kwarg options to sweep)
    :param n_iter: number of random-outage repetitions to average over
    :param freq: seasonal frequency passed to the R ts() constructor
    :param measures: dict of metric name -> scoring function; defaults to
        the standard suite (was previously a mutable default argument)
    :return: DataFrame of scores averaged over the repetitions
    """
    if measures is None:
        # BUG FIX: the default metric dict used to be a mutable default
        # argument; build it per call instead.
        measures = {
            'SMAE': pf.smae,
            'RMSE': pf.rmse,
            'SRMSE': pf.srmse,
            'SMAPE': pf.smape,
            'MASE': partial(pf.mase, shift=60 * 24 * 7)
        }
    dist = out_dist(data)  # get the distribution of outage lengths
    data_lno = lno(data)  # get the longest no outage (LNO)
    ts = ro.r.ts  # R time series object
    pandas2ri.activate()  # activate connection
    results = []  # initialize empty list for results
    # repeat multiple times becaouse of random nature of outage additions
    for i in range(n_iter):
        data_out = add_out(data=data_lno, dist=dist)  # add outages
        # construct time series object & estimate frequency
        data_out_ts = ts(ro.FloatVector(data_out.values), frequency=freq)
        result = pd.DataFrame()  # empty dataframe for scores
        for method in methods:  # for each method under consideration
            name = method['name']  # get name
            alg = method['alg']  # get algorithm
            opt = method['opt']  # get options
            for kwargs in dp.dol2lod(opt):  # for all combinations of kwargs
                print(str(i) + ':', kwargs)  # progress update
                # get results of imputation from R & construct a Series
                # using the original index and shape
                data_imp = pd.Series(
                    index=data_out.index,
                    data=np.reshape(pandas2ri.ri2py(alg(data_out_ts, **kwargs)),
                                    newshape=data_out.shape,
                                    order='C'))
                # build entry label from sorted keys
                label = ','.join([name] + [
                    str(key) + ':' + str(kwargs[key]) for key in sorted(kwargs)
                ])
                pfm = pf.ev(pred=data_imp, true=data_lno, label=label,
                            measures=measures)  # evaluate performance
                result = pd.concat([result, pfm])  # append computed performance
        result.index.name = 'method'  # name index column
        results.append(result)  # add to results
    pandas2ri.deactivate()  # deactivate connection
    return sum(results) / n_iter
def computeNClusters(counts, min_size=20):
    """Computes the number of clusters from the data using
    Scran::quickCluster (parallel BiocParallel backend)."""
    pandas2ri.activate()
    transposed = pandas2ri.py2ri(counts.transpose())
    scran = RimportLibrary("scran")
    parallel = RimportLibrary("BiocParallel")
    parallel.register(
        parallel.MulticoreParam(multiprocessing.cpu_count() - 1))
    to_matrix = r["as.matrix"]
    assignments = scran.quickCluster(to_matrix(transposed), min_size)
    n_clust = len(set(assignments))
    pandas2ri.deactivate()
    return n_clust
def computeSizeFactors(counts):
    """ Computes size factors using DESeq
    for the counts matrix given as input (Genes as rows
    and spots as columns).
    Returns the computed size factors as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors a vector
    """
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    deseq2 = RimportLibrary("DESeq2")
    r_factors = deseq2.estimateSizeFactorsForMatrix(r_counts)
    size_factors = pandas2ri.ri2py(r_factors)
    pandas2ri.deactivate()
    return size_factors
def computeRLEFactors(counts):
    """ Compute normalization size factors
    using the RLE method described in EdgeR and returns then as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors a vector
    """
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    edger = RimportLibrary("edgeR")
    to_matrix = r["as.matrix"]
    norm_factors = pandas2ri.ri2py(
        edger.calcNormFactors(to_matrix(r_counts), method="RLE"))
    col_sums = pandas2ri.ri2py(r.colSums(counts))
    pandas2ri.deactivate()
    # Effective size factor = normalization factor * library size.
    return norm_factors * col_sums
def testRepr(self):
    # this should go to testVector, with other tests for repr()
    columns = (('b', numpy.array([True, False, True], dtype=numpy.bool_)),
               ('i', numpy.array([1, 2, 3], dtype="i")),
               ('f', numpy.array([1, 2, 3], dtype="f")),
               ('s', numpy.array(["a", "b", "c"], dtype="S")),
               ('u', numpy.array([u"a", u"b", u"c"], dtype="U")))
    frame = pandas.core.frame.DataFrame(OrderedDict(columns))
    rpyp.activate()
    r_frame = robjects.conversion.py2ri(frame)
    rpyp.deactivate()
    text = repr(r_frame)  # used to fail with a TypeError
    second_line = text.split('\n')[1].strip()
    self.assertEqual('[Array, Array, Array, FactorV..., FactorV...]',
                     second_line)
def testDataFrame(self):
    """A pandas DataFrame converts to an R data.frame of the same shape."""
    columns = (('b', numpy.array([True, False, True], dtype=numpy.bool_)),
               ('i', numpy.array([1, 2, 3], dtype="i")),
               ('f', numpy.array([1, 2, 3], dtype="f")),
               ('s', numpy.array(["a", "b", "c"], dtype="S")),
               ('u', numpy.array([u"a", u"b", u"c"], dtype="U")),
               ('dates', [datetime(2012, 5, 2),
                          datetime(2012, 6, 3),
                          datetime(2012, 7, 1)]))
    frame = pandas.core.frame.DataFrame(OrderedDict(columns))
    rpyp.activate()
    r_frame = robjects.conversion.py2ri(frame)
    rpyp.deactivate()
    n_rows, n_cols = frame.shape
    self.assertEqual(n_rows, r_frame.nrow)
    self.assertEqual(n_cols, r_frame.ncol)
def deaDESeq2(counts, conds, comparisons, alpha, size_factors=None):
    """Makes a call to DESeq2 to perform D.E.A. in the given
    counts matrix with the given conditions and comparisons.
    Can be given size factors.
    Returns a list of DESeq2 results for each comparison.

    :param counts: a counts matrix (genes as rows, samples as columns)
    :param conds: a list of condition labels, one per sample/column
    :param comparisons: an iterable of (A, B) condition pairs to contrast
    :param alpha: the significance cutoff passed to DESeq2 results()
    :param size_factors: optional pre-computed size factors; when given,
                         DESeq2's own estimation is bypassed
    :return: a list of pandas dataframes, one DESeq2 result per comparison
    """
    results = list()
    pandas2ri.activate()
    # Use try/finally so the global pandas<->R conversion is always switched
    # off, even when an R call raises (the old code skipped deactivate() on
    # error and re-raised the exception with a useless `except ... raise e`).
    try:
        deseq2 = RimportLibrary("DESeq2")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.DataFrame({"conditions": robjects.StrVector(conds)})
        design = r('formula(~ conditions)')
        dds = r.DESeqDataSetFromMatrix(countData=r_counts, colData=cond, design=design)
        if size_factors is None:
            dds = r.DESeq(dds, parallel=True, useT=True,
                          minmu=1e-6, minReplicatesForReplace=np.inf)
        else:
            # Inject the externally computed size factors and run the
            # remaining DESeq2 steps (dispersions + Wald test) manually
            assign_sf = r["sizeFactors<-"]
            dds = assign_sf(object=dds, value=robjects.FloatVector(size_factors))
            dds = r.estimateDispersions(dds)
            dds = r.nbinomWaldTest(dds)
        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B),
                               alpha=alpha, parallel=True)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to
            # pandas so we do it manually
            result.index = genes
            results.append(result)
    finally:
        pandas2ri.deactivate()
    return results
def Rtsne(counts, dimensions, theta=0.5, dims=50, perplexity=30, max_iter=1000):
    """Performs dimensionality reduction using the R package Rtsne.

    :param counts: a matrix of counts to reduce
    :param dimensions: the number of output dimensions (Rtsne `dims`)
    :param theta: speed/accuracy trade-off for Barnes-Hut t-SNE
    :param dims: the number of dimensions kept by the initial PCA step
    :param perplexity: the t-SNE perplexity parameter
    :param max_iter: the maximum number of iterations
    :return: the reduced coordinates (the Rtsne `Y` slot) as a pandas object
    """
    pandas2ri.activate()
    try:
        r_counts = pandas2ri.py2ri(counts)
        tsne = RimportLibrary("Rtsne")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
        as_matrix = r["as.matrix"]
        # Pass the explicitly converted r_counts: the original converted it
        # and then passed the raw pandas `counts` instead, leaving r_counts
        # dead and relying on the implicit activated conversion.
        tsne_out = tsne.Rtsne(as_matrix(r_counts), dims=dimensions, theta=theta,
                              check_duplicates=False, pca=True, initial_dims=dims,
                              perplexity=perplexity, max_iter=max_iter,
                              verbose=False)
        pandas_tsne_out = pandas2ri.ri2py(tsne_out.rx2('Y'))
    finally:
        # Always switch the global conversion off, even on R errors
        pandas2ri.deactivate()
    return pandas_tsne_out
def apply_correlation_metric(r_stream1, r_stream2, metric_function_name, *args, **kwargs):
    """
    Invoke a named "correlation" R metric and convert the R dataframe
    result into a Pandas dataframe.

    :param r_stream1: an r_stream object
    :param r_stream2: an r_stream object
    :param metric_function_name: the name of the set of metrics
    :return: a pandas dataframe of metric values, with the `starttime` and
             `endtime` columns converted from R POSIXct to UTCDateTime
    """
    # Resolve the R function by name inside the IRISMustangMetrics package,
    # e.g. "correlation" -> IRISMustangMetrics::correlationMetric
    function = 'IRISMustangMetrics::' + metric_function_name + 'Metric'
    R_function = robjects.r(function)
    # Conversion only needs to be active around the R call itself;
    # ri2py_dataframe below is an explicit conversion and works regardless.
    pandas2ri.activate()
    r_metriclist = R_function(r_stream1, r_stream2, *args, **kwargs)  # args and kwargs shouldn't be needed in theory
    pandas2ri.deactivate()
    r_dataframe = _R_metricList2DF(r_metriclist)
    df = pandas2ri.ri2py_dataframe(r_dataframe)
    # Convert columns from R POSIXct to python UTCDateTime
    df.starttime = df.starttime.apply(UTCDateTime)
    df.endtime = df.endtime.apply(UTCDateTime)
    return df
def calculate_prec(cross_df, automate=False):
    """Calculate the prec_inf using R and return a fully constructed
    plottable dataframe.

    Args:
       cross_df: pandas dataframe containing the data
       automate: bool, a To do feature to automatically calculate the best fit

    Returns:
       dataframe containing the R added precision values to be received
       most always by the plotting commander.
    """
    import rpy2.robjects as ro
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    import rpy2.robjects.numpy2ri
    import rpy2.rinterface as rin

    stats = importr('stats')
    base = importr('base')
    # activate R environment in python
    rpy2.robjects.numpy2ri.activate()
    pandas2ri.activate()
    # read in the necessary columns (the original had a garbled, unused
    # `menu = [...]` literal spliced into this comment; removed as dead junk)
    df = pd.DataFrame({'x': cross_df['Kpoints_atom_density'],
                       'y': cross_df['Energy']})
    ro.globalenv['dataframe'] = df
    ### *** R used to obtain the fit on the data to calculate prec_inf *** ###
    # perform regression - bokeh widgets can be used here to provide the
    # inputs to the nls regression; some python to R translation of object
    # names via the pandas - R dataframes
    y = df['y']
    x = df['x']
    l = len(y) - 1  # needed because R indexes list from 1 to len(list)
    # ***WIDGET inputs*** OR AUTOMATE
    # the slider inputs on starting point or can be automated also
    l1 = 3
    l2 = 0
    fitover = rin.SexpVector(list(range(l1, l - l2)), rin.INTSXP)
    # numeric entry widget for 'b' is plausible for user to choose best starting guess
    start_guess = {'a': y[l], 'b': 5}
    start = pandas2ri.py2ri(pd.DataFrame(start_guess, index=start_guess))
    # drop down list selection of model
    model = 'y~a*x/(b+x)'
    # Minimize function with weights and selection.
    # NOTE: the original used "x^2", which is bitwise XOR in Python (and a
    # TypeError on float data); squared weights (x**2) were clearly intended.
    m = stats.nls(model, start=start, algorithm="port", subset=fitover,
                  weights=x**2, data=base.as_symbol('dataframe'))
    # Estimation of goodness of fit
    g = stats.cor(y[l1:l - l2], stats.predict(m))
    # Report summary of fit, values and error bars
    print(base.summary(m).rx2('coefficients'))
    # Extrapolation value is given by a
    # NOTE(review): coef(m)[1] picks the SECOND coefficient under Python's
    # 0-based indexing — confirm this really selects 'a' and not 'b'.
    a = stats.coef(m)[1]
    # Calculation of precision
    prec = abs(y - a)
    # test print outs of the data ? how to render onto html like Shiny if necessary ?
    print("We learn that the converged value is: {0} and best precision achieved in the measurement is {1}".format(a, min(abs(prec))))
    cross_df['Energy_Prec_Inf'] = prec
    # close the R environments
    rpy2.robjects.numpy2ri.deactivate()
    pandas2ri.deactivate()
    return (cross_df)
def testCategorical(self):
    # An R factor should round-trip to a pandas Categorical
    r_factor = robjects.vectors.FactorVector(('a', 'b', 'a'))
    rpyp.activate()
    converted = robjects.conversion.ri2py(r_factor)
    rpyp.deactivate()
    self.assertEqual(type(converted), pandas.Categorical)