def R_rdc(X, y): numpy2ri.activate() rstring = """ library(foreach) library(doParallel) rdc <- function(x,y,k=20,s=1/6,f=sin) { x <- cbind(apply(as.matrix(x),2,function(u)rank(u)/length(u)),1) y <- cbind(apply(as.matrix(y),2,function(u)rank(u)/length(u)),1) x <- s/ncol(x)*x%*%matrix(rnorm(ncol(x)*k),ncol(x)) y <- s/ncol(y)*y%*%matrix(rnorm(ncol(y)*k),ncol(y)) tryCatch(cancor(cbind(f(x),1),cbind(f(y),1))$cor[1], error = function(e){0}) } rdcs_for_all <- function(X, y) { cl<-makeCluster(40) clusterExport(cl, c('rdc'), envir=environment()) registerDoParallel(cl) res = list() res <- foreach (c_=c(1:ncol(X))) %dopar% { rdc(y, X[,c_]) } stopCluster(cl) return(res) } """ rfunc=robjects.r(rstring) res = rfunc(X, y) return np.array([x[0] for x in res])
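# Minimal usage sketch for R_rdc above, assuming this module's imports (np, robjects,
# numpy2ri) are in place and the R packages foreach and doParallel are installed; the
# data shapes and the 40-worker cluster hard-coded in the R string are illustrative only.
import numpy as np

X = np.random.standard_normal((200, 5))   # 200 samples, 5 candidate features
y = np.random.standard_normal(200)        # response vector
rdc_scores = R_rdc(X, y)                  # one randomized dependence coefficient per column of X
print(rdc_scores.shape)                   # (5,)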
def make_plot(pre, post): """ Plot the pre return values and post return values """ data = make_returns_data(pre, post) numpy2ri.activate() column = robjects.r["c"] sequence = robjects.r["seq"] robjects.r["plot"]( x=2, y=2, xlim=column(0, len(data["pre"]) - 1), ylim=column(data["min"] - .1, data["max"] + .1), xlab="Event number", ylab="Returns" ) robjects.r["points"]( x=sequence(0, len(data["pre"]) - 1), y=data["pre"], col="red", pch=19 ) robjects.r["points"]( x=sequence(0, len(data["pre"]) - 1), y=data["post"], col="blue", pch=19 ) robjects.r["abline"](h=0) signal.signal(signal.SIGINT, lambda a, b: sys.exit(0)) signal.pause()
def extractTableFactor(self, tableFactor): formulaModel = [] formulaErrorTerm = [] numpy2ri.activate() for t in self.tableFactor: factorName = t[0] factorType = t[1] factorData = t[2] # sending Data to global variable in R (Factor definition for # Subject, Within or Between Type and FloatVector for Covariate if factorType == 'Covariate': tmp = robjects.FloatVector(factorData) robjects.globalenv[factorName] = tmp else: tmp = robjects.r.factor(factorData) robjects.globalenv[factorName] = tmp # Creating Fromula for R - different treatement for within and # between subject Factor if factorType == 'Subject': subjectName = factorName self.FactorSubject = factorData elif factorType == 'Within': formulaModel.append(factorName) formulaErrorTerm.append(factorName) else: formulaModel.append(factorName) return formulaModel, formulaErrorTerm, subjectName
def activate(): global original_converter # If module is already activated, there is nothing to do if original_converter is not None: return original_converter = conversion.make_converter('snapshot before pandas conversion', template=conversion.converter) numpy2ri.activate() new_converter = conversion.make_converter('snapshot before pandas conversion', template=conversion.converter) numpy2ri.deactivate() for k,v in py2ri.registry.items(): if k is object: continue new_converter.py2ri.register(k, v) for k,v in ri2ro.registry.items(): if k is object: continue new_converter.ri2ro.register(k, v) for k,v in py2ro.registry.items(): if k is object: continue new_converter.py2ro.register(k, v) for k,v in ri2py.registry.items(): if k is object: continue new_converter.ri2py.register(k, v) conversion.converter = new_converter name, conversion.ri2ro, conversion.py2ri, conversion.py2ro, conversion.ri2py, lineage = new_converter
def __init__(self, data, nclusters=2, eigengap=False):
    ro.r("source('Rcode/kNNutils.R')")
    numpy2ri.activate()
    self.data = data
    self.eigengap = eigengap
    self.dataDim = data.shape[1]
    self.dataSize = data.shape[0]
    self.nclusters = nclusters
    self.kNN = ro.r['getKNearestNeighbors']
def OnHeatmap(self, event):
    number = self.notebook.GetListTabId().index(self.notebook.GetCurrentTabId())
    heatmapDlg = DialogHeatmap(title=u"Input for heatmap")
    pars = heatmapDlg.GetValue()
    print(pars)
    matStart = pars[0] - 1
    data = self.data[number]
    numData = numpy.array(data)[1:, matStart:].astype(float)
    # get numpy data column items
    items = numpy.array(data)[0, matStart:].astype(str)
    import rpy2.robjects as robjects
    from rpy2.robjects.packages import importr
    base = importr("base")
    from rpy2.robjects import numpy2ri
    numpy2ri.activate()
    # transfer the numpy data to an R matrix
    numDataR = transposeNumpyMat2R(numData)
    # the numData columns become the row names of the R matrix; heatmap3 expects this format
    numDataR.rownames = robjects.StrVector(items)
    # get column side annotation colors and the color list for the legend
    annoCols = [x - 1 for x in pars[1]]
    for n, annoCol in enumerate(annoCols):
        anno = numpy.array(data)[1:, int(annoCol)]
        annoColDic = getCategoryColorDic(list(set(anno)), colsDic)
        cols = getMemberColor(anno, annoColDic)
        if n == 0:
            annoColor1 = robjects.StrVector(cols)
            annoColDicList = [annoColDic]
            ColSideColors = base.cbind(annoColor1)  # should use matrix in R instead of dataframe
            print(annoColDicList)
        if n == 1:
            annoColor2 = robjects.StrVector(cols)
            ColSideColors = base.cbind(annoColor1, annoColor2)
            annoColDicList = annoColDicList + [annoColDic]
            print(annoColDicList)
        if n >= 2:
            annoColorX = robjects.StrVector(cols)
            ColSideColors = base.cbind(ColSideColors, annoColorX)
            annoColDicList = annoColDicList + [annoColDic]  # keep one dict per annotation for the legend
    print(base.dim(ColSideColors))
    annoName = robjects.StrVector(numpy.array(data)[0, annoCols])
    ColSideColors.colnames = annoName
    outputDlg = OutputDialog()
    outPath = outputDlg.GetPath()
    print(outPath)
    fileName = outPath + "/heatmap.pdf"
    heatmap3py(numDataR, ColSideColors, annoColDicList, fileName=fileName, outPath=outPath)
    heatmap3py(numDataR, ColSideColors, annoColDicList)
def testActivateTwice(self):
    # setUp method has already activated numpy converter
    self.assertEqual(rpyn.numpy2ri, robjects.conversion.py2ri)
    rpyn.activate()
    self.assertEqual(rpyn.numpy2ri, robjects.conversion.py2ri)
    rpyn.deactivate()
    self.assertNotEqual(rpyn.numpy2ri, robjects.conversion.py2ri)
    rpyn.deactivate()
    self.assertNotEqual(rpyn.numpy2ri, robjects.conversion.py2ri)
def testActivate(self):
    rpyn.deactivate()
    #FIXME: is the following still making sense ?
    self.assertNotEqual(rpyn.py2ri, conversion.py2ri)
    l = len(conversion.py2ri.registry)
    k = set(conversion.py2ri.registry.keys())
    rpyn.activate()
    self.assertTrue(len(conversion.py2ri.registry) > l)
    rpyn.deactivate()
    self.assertEqual(l, len(conversion.py2ri.registry))
    self.assertEqual(k, set(conversion.py2ri.registry.keys()))
def load_ipython_extension(ip): """Load the extension in IPython.""" if pandas2ri: pandas2ri.activate() else: numpy2ri.activate() ip.register_magics(RMagics) # Initialising rpy2 interferes with readline. Since, at this point, we've # probably just loaded rpy2, we reset the delimiters. See issue gh-2759. if ip.has_readline: ip.readline.set_completer_delims(ip.readline_delims)
def wilcoxon_signed_rank_test(x, conflevel=0.95): """Wilcoxon signed rank test, with confidence interval. Requires rpy2.""" from rpy2.robjects import numpy2ri from rpy2.robjects.packages import importr numpy2ri.activate() r_stats = importr('stats') d = {'conf.int': True, 'conf.level': conflevel} r = r_stats.wilcox_test(np.array(x), **d) r = dict( statistic=np.asscalar(np.asarray(r.rx('statistic'))), pvalue=np.asscalar(np.asarray(r.rx('p.value'))), confint=[np.asscalar(x) for x in np.asarray(r.rx('conf.int')).flat], estimate=np.asscalar(np.asarray(r.rx('estimate'))), ) return r
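# Hedged usage sketch for wilcoxon_signed_rank_test: the paired differences below are
# made-up numbers, and a working R installation with rpy2 is required for importr('stats').
import numpy as np

diffs = np.array([1.2, -0.4, 0.8, 2.1, 0.3, -0.1, 1.7, 0.9])
res = wilcoxon_signed_rank_test(diffs, conflevel=0.95)
print(res['statistic'], res['pvalue'])
print(res['confint'])   # confidence interval for the pseudo-median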
def activate():
    global original_py2ri, original_ri2ro
    # If module is already activated, there is nothing to do
    if original_py2ri:
        return

    #FIXME: shouldn't the use of numpy conversion be made
    #       explicit in the pandas conversion ?
    #       (and this remove the need to activate it ?)
    numpy2ri.activate()
    original_py2ri = conversion.py2ri
    original_ri2ro = conversion.ri2ro
    conversion.py2ri = pandas2ri
    conversion.ri2ro = ri2pandas
def _init_r():
    """Private function to initialise R, only executed when needed."""
    global _r_initialised
    global rpy2
    global ro
    global r

    if not _r_initialised:
        import rpy2  # noqa
        import rpy2.robjects as ro  # noqa
        from rpy2.robjects import r  # noqa
        import rpy2.robjects.numpy2ri as numpy2ri
        numpy2ri.activate()
        _r_initialised = True
def OnPCA(self, event): # this function rely on vegan package number = self.notebook.GetCurrentDataId() pcaDlg = DialogPCA(title=u"Input for PCA") startColNum, groupColNum = pcaDlg.GetValue() import rpy2.robjects as robjects from rpy2.robjects.packages import importr base = importr("base") vegan = importr("vegan") graphics = importr("graphics") stats = importr("stats") data = self.notebook.data[number] # need to transpose if the sample is arranged along with column # python array is arranged by row # PCA is calculated with row numData = numpy.array(data)[1:,startColNum: ].astype(float) grp = numpy.array(data)[1:, groupColNum] # colsDic = {1:"red", 2:"orange", 3:"blue", 4:"forestgreen"} groupColDic = getCategoryColorDic (list(set(grp)), colsDic) cols = getMemberColor(grp, groupColDic) # col = [colsDic[x] for x in grp] # print col from rpy2.robjects import numpy2ri numpy2ri.activate() # transfer the numpy array to matrix in R pca = vegan.rda(numData, scale = True) col4R = robjects.StrVector(cols) # for vector, need to transfer explicitly, numpy2ri didn't work scl = 1 ## scaling graphics.plot(pca, display = "sites", scaling = scl , type = "n") # stats.biplot(pca, main = "biplot") graphics.points(pca, display = "sites", scaling = scl, col = col4R, pch = 16) # color map to the group # lev = base.levels(grp) lev = list(set(grp)) # print lev # please note the order of group set from python and R is different. next time just try to use one method for i in range(len(lev)): ## draw ellipse per group vegan.ordiellipse(pca, display = "sites", kind = "se", scaling = scl, groups = robjects.StrVector(grp), col = groupColDic[lev[i]], show_groups = lev[i]) ## centroids scrs = base.as_data_frame(vegan.scores(pca, display = "sites", scaling = scl, choices = robjects.IntVector([1,2]))) cent = base.do_call(base.rbind, base.lapply(base.split(scrs, robjects.StrVector(grp)), base.colMeans)) # split map scores to group centRowname = base.row_names(cent) centCols = [groupColDic[x] for x in centRowname] graphics.points(cent, col = robjects.StrVector(centCols), pch = 3, cex = 1.1)
def run(self, x, init=None, kRange=None):
    self.shape = x.shape
    numpy2ri.activate()
    r = robjects.r
    r.options(warn=-1)
    r.library('mclust')
    if kRange is None:
        kRange = arange(1, 10)
    if init is None:
        mcr = r.Mclust(x)
    else:
        # draw a random subset of row indices for the initialization
        subset = np.array(random.sample(range(self.shape[0]), init))
        subsetR = r['list'](subset=subset)
        mcr = r.Mclust(x, initialization=subsetR)
    self.mclustRes = dict((name, value) for name, value in mcr.items())
    return self
def init_rpy():
    global _rpy_initialized
    if _rpy_initialized:
        return
    _rpy_initialized = True

    from rpy2 import robjects
    from rpy2.robjects import numpy2ri
    import os

    path = os.path.dirname(__file__)
    robjects.r("options(warn=-1)")
    with open(path + "/rdc.R", "r") as rfile:
        code = ''.join(rfile.readlines())
    robjects.r(code)
    numpy2ri.activate()
def cqn(matrix, gc_content, lengths): """ Conditional quantile normalization (CQN) with the ``cqn`` R library. It uses GC content and length of regulatory elements as covariates. Requires the R package "cqn" to be installed: .. highlight:: R .. code-block:: R if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("cqn") Parameters ---------- matrix : :class:`pandas.DataFrame` DataFrame to normalize. gc_content : :class:`pandas.Series` Series with GC content of each feature in ``matrix``. lengths : :class:`pandas.Series` Series with length of each feature in ``matrix``. Returns ---------- :class:`pandas.DataFrame` Normalized DataFrame """ from rpy2.robjects import numpy2ri, pandas2ri, r from rpy2.robjects.packages import importr numpy2ri.activate() pandas2ri.activate() importr("cqn") cqn_out = r.cqn(matrix, x=gc_content, lengths=lengths) y_r = cqn_out[list(cqn_out.names).index("y")] y = pd.DataFrame(np.array(y_r), index=matrix.index, columns=matrix.columns) offset_r = cqn_out[list(cqn_out.names).index("offset")] offset = pd.DataFrame(np.array(offset_r), index=matrix.index, columns=matrix.columns) return y + offset
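# Illustrative call to cqn() with a synthetic count matrix; the feature names, GC fractions
# and lengths are invented, and the R package 'cqn' must be installed for this to run.
import numpy as np
import pandas as pd

counts = pd.DataFrame(
    np.random.poisson(20, size=(200, 4)),
    index=["peak%d" % i for i in range(200)],
    columns=["s1", "s2", "s3", "s4"],
)
gc = pd.Series(np.random.uniform(0.3, 0.7, 200), index=counts.index)
lengths = pd.Series(np.random.randint(200, 2000, 200), index=counts.index)
normalized = cqn(counts, gc, lengths)   # DataFrame with the same index/columns as counts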
def __init__(self, *args, **kwargs): # Task specific arguments. self.snp_set = kwargs.pop("snp_set_file", None) if self.snp_set: filename = self.snp_set self.snp_set = self._parse_snp_set(self.snp_set) m = ("Using SNP sets from '{}'. Found a total of {} variants in {}" " different SNP sets.") m = m.format(filename, self.snp_set.shape[0], self.snp_set["set"].nunique()) logger.info(m) self.skat_o = kwargs.pop("SKAT-O", False) if self.skat_o: logger.info("Using the SKAT-O test.") # Task initalization using the abstract implementation. super(SKATTest, self).__init__(*args, **kwargs) # Check installation. SKATTest.check_skat() # Import rpy2. from rpy2.robjects import numpy2ri numpy2ri.activate() # Support for numpy arrays. import rpy2.robjects self.robjects = rpy2.robjects self.r = rpy2.robjects.r from rpy2.robjects.packages import importr # Load the SKAT package. try: self.skat = importr("SKAT") except Exception: raise EnvironmentError( 1, "SKAT needs to be installed in your R environment to use " "SKATTest." )
def algorithm(X, y): numpy2ri.activate() rpy.r.assign('X', X) rpy.r.assign('y', y) rpy.r(''' y = as.matrix(y) D = data.frame(X, y) glm(y ~ ., family=binomial(link='probit'), data=D) M = glm(y ~ ., family=binomial(link='probit'), data=D) M0 = glm(y ~ 1, family=binomial(link='probit'), data=D) Mselect = step(M, direction='both', scope=list(upper=M, lower=M0), trace=FALSE) selected_vars = names(coef(Mselect)) ''') selected_vars = ' + '.join(sorted(list(rpy.r('selected_vars')))) selected_vars = selected_vars.replace('(Intercept)', '1') numpy2ri.deactivate() return tuple(selected_vars.split(' + '))
def cqn(matrix, gc_content, lengths): from rpy2.robjects import numpy2ri, pandas2ri, r from rpy2.robjects.packages import importr numpy2ri.activate() pandas2ri.activate() importr("cqn") cqn_out = r.cqn(matrix, x=gc_content, lengths=lengths) y_r = cqn_out[list(cqn_out.names).index("y")] y = pd.DataFrame(np.array(y_r), index=matrix.index, columns=matrix.columns) offset_r = cqn_out[list(cqn_out.names).index("offset")] offset = pd.DataFrame(np.array(offset_r), index=matrix.index, columns=matrix.columns) return y + offset
def gaussian_setup(X, Y, run_CV=True): """ Some calculations that can be reused by methods: lambda.min, lambda.1se, lambda.theory and Reid et al. estimate of noise """ n, p = X.shape Xn = X / np.sqrt((X**2).sum(0))[None, :] numpy2ri.activate() rpy.r.assign('X', X) rpy.r.assign('Y', Y) rpy.r('X=as.matrix(X)') rpy.r('Y=as.numeric(Y)') rpy.r('sigma_ds=estimate_sigma_data_splitting(X,Y)') sigma_ds = rpy.r('sigma_ds') l_theory = np.fabs(Xn.T.dot(np.random.standard_normal( (n, 500)))).max(1).mean() * np.ones(p) * sigma_ds if run_CV: numpy2ri.activate() rpy.r.assign('X', X) rpy.r.assign('Y', Y) rpy.r('X=as.matrix(X)') rpy.r('Y=as.numeric(Y)') rpy.r('G = cv.glmnet(X, Y, intercept=FALSE, standardize=FALSE)') rpy.r( 'sigma_reid = selectiveInference:::estimate_sigma(X, Y, coef(G, s="lambda.min")[-1]) # sigma via Reid et al.' ) rpy.r("L = G[['lambda.min']]") rpy.r("L1 = G[['lambda.1se']]") L = rpy.r('L') L1 = rpy.r('L1') sigma_reid = rpy.r('sigma_reid')[0] numpy2ri.deactivate() return L * np.sqrt(X.shape[0]), L1 * np.sqrt( X.shape[0]), l_theory, sigma_reid else: return None, None, l_theory, None
def python_to_r_object(cls, item):
    """
    Convert a Python object to an R object (class method).
    :param item: Python object; supported types are list, tuple, pd.Series,
        np.ndarray and pd.DataFrame
    :return: R object
    """
    numpy2ri.activate()
    if isinstance(item, (list, tuple, pd.Series)):
        return np.array(item)
    elif isinstance(item, pd.DataFrame):
        data_dict = {col_name: np.array(item[col_name]) for col_name in item.columns}
        rdataframe = DataFrame(data_dict)
        rdataframe.rownames = np.array(item.index)
        return rdataframe
    elif isinstance(item, (np.ndarray, bool, int, float, str)):
        return item
    else:
        raise TypeError('Unsupported type: {}'.format(type(item)))
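# Small usage sketch for python_to_r_object; the enclosing class is called RConverter
# here purely for illustration (the real class name is not shown in this snippet).
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]}, index=["a", "b", "c"])
r_df = RConverter.python_to_r_object(df)          # rpy2 DataFrame with rownames a, b, c
arr = RConverter.python_to_r_object([1, 2, 3])    # lists come back as a numpy array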
def convert_to_r_data(data):
    # Input is sumu.Data
    init_r()
    numpy2ri.activate()
    datar = r.matrix(data.all().flatten(),
                     nrow=data.N, ncol=data.n, byrow=True)
    numpy2ri.deactivate()
    discrete = data.discrete
    arities = True if data.arities is not False else False
    datar = r['datapath_or_matrix_to_numeric_dataframe'](datar,
                                                         discrete=discrete,
                                                         arities=arities)
    return datar
def zero_inflate(file): utils = importr("utils") numpy2ri.activate() nr, nc = file.shape Br = ro.r.matrix(file, nrow=nr, ncol=nc) ro.r.assign("tab", Br) zr = ro.r(''' set.seed(1) dat <- tab fo <- as.matrix((dat != 0)) mode(fo) <- "integer" m <- matrix(sample(0:1,nrow(dat)*ncol(dat), replace=TRUE, prob=c(1,3)),nrow(dat),ncol(dat)) dat <- dat*m zr <- fo*(1-m) ''') zeros = ro.r('fo') data = ro.r('dat') numpy2ri.deactivate() return data, zr, zeros
def scran_normalize(adata): import numpy as np import rpy2.robjects as ro from rpy2.robjects import numpy2ri from rpy2.robjects.packages import importr importr('scran') numpy2ri.activate() ro.r.assign('mat', adata.X.T) qclust_params = 'mat' # qclust_params = f'mat, min.size={min_size}, max.size={max_size}' ro.reval(f'cl <- quickCluster({qclust_params})') csf_params = f'mat, clusters=cl' # csf_params = f'mat, clusters=cl, min.mean={min_mean}' sf = np.asarray(ro.reval(f'computeSumFactors({csf_params})')) adata.obs['sf'] = sf adata.layers['counts'] = adata.X.copy() adata.X /= adata.obs['sf'].values[:, None] numpy2ri.deactivate() return adata
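# Hedged usage of scran_normalize on an AnnData holding raw counts in .X; assumes the
# anndata package on the Python side and the Bioconductor package 'scran' in R, and the
# cell/gene counts below are only illustrative (quickCluster needs a reasonable number
# of cells to form clusters).
import numpy as np
import anndata

counts = np.random.poisson(5, size=(300, 100)).astype(float)   # 300 cells x 100 genes
adata = anndata.AnnData(counts)
adata = scran_normalize(adata)
print(adata.obs['sf'].head())   # per-cell size factors used for the normalization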
def is_Ckmeans_installed(): try: import rpy2 import rpy2.robjects.numpy2ri as numpy2ri try: from importlib import reload reload(rpy2.robjects.numpy2ri) except: pass import rpy2.robjects as ro ro.conversion.py2ri = numpy2ri numpy2ri.activate() from rpy2.robjects.packages import importr importr('Ckmeans.1d.dp') median_seg_func = ro.r('Ckmedian.1d.dp') mean_seg_func = ro.r('Ckmeans.1d.dp') except: return False return True
def find_modules(data, sym=False): """ Take an adjacency matrix, and return the modules that are found and the reordering of the matrix and the corresponding strings for vis.js. """ # If it's a bipartite graph we need to use a different modularity finding # algorithm: http://rsos.royalsocietypublishing.org/content/3/1/140536 if not sym: numpy2ri.activate() res = r_code.run_bivar_modules(np.abs(data.values)) rix = np.array(res[0]) - 1 cix = np.array(res[1]) - 1 rl = np.array(res[2]) cl = np.array(res[3]) numpy2ri.deactivate() # Else we can use the highly popular: https://arxiv.org/abs/0803.0476 else: graph = create_network(r=data, p=None, sym=True, modules=True) part = community.best_partition(graph) # not all indices make it into part. those that don't, will be assigned # to a dummy module with k=max(modules)+1. part_complete = [] part_non_complete = [] max_module_n = max(part.values()) for i in data.index: if i in part: part_complete.append(part[i]) part_non_complete.append(part[i]) else: part_complete.append(max_module_n + 1) # reorder data matrix so modules end up next to each other cix = rix = np.argsort(part_complete) # we only want to keep the true module labels not the dummy ones part_non_complete = np.array(part_non_complete) cl = rl = part_non_complete[np.argsort(part_non_complete)] # transform the selected modules into strings that are understood by vis.js modules = get_mod_strings(rl, cl, sym) return list(rix), list(cix), modules
def map_loinc_system(): if config.print_status == 'Y': print('Mapping LOINC System') if os.path.exists(config.out_dir + "LOINC_System_to_Long.csv"): system_map = pd.read_csv(config.out_dir + "LOINC_System_to_Long.csv", sep="|") else: numpy2ri.activate() stringdist = importr('stringdist', lib_loc=config.lib_loc) loinc_syst = parsed_loinc_fields[['System', 'LongName']] loinc_syst = loinc_syst[(~pd.isnull(loinc_syst.System)) & (loinc_syst.System != '')].reset_index( drop=True) loinc_syst.System = loinc_syst.System.str.split(" ") loinc_syst.LongName = loinc_syst.LongName.str.split(" ") system_tokens = pd.Series([y for x in loinc_syst.System for y in x]).unique() longname_tokens = pd.Series( [y for x in loinc_syst.LongName for y in x]).unique() system_df = pd.DataFrame(0, index=system_tokens, columns=longname_tokens) n_rows = loinc_syst.shape[0] for i in range(n_rows): for j, term in enumerate(loinc_syst.System[i]): dists = stringdist.stringdist(term, loinc_syst.LongName[i], method='jw', p=0) bestMatch = loinc_syst.LongName[i][np.argmin(dists)] system_df.loc[term, bestMatch] = system_df.loc[term, bestMatch] + 1 high_count = system_df.idxmax(axis=1).values system_map = pd.DataFrame({ 'SystemToken': system_tokens, 'SystemMap': high_count }) if config.write_file_loinc_parsed: system_map.to_csv(config.out_dir + "LOINC_System_to_Long.csv", sep="|", index=False) return system_map
def mtcorrect(p_value_dict, **kwargs): """ Apply MT correction. This is a wrapper for R's p.adjust function. Arguments: p_value_dict: a dict with keys = probe names, and values of p-values kwargs: method: MT correction method, from R. See mtcorrect_methods. Default is 'none' Returns: adjusted_p """ continue_flag = True method = test_kwarg('method', kwargs, mtcorrect_methods) try: from rpy2.robjects import r from rpy2 import robjects from rpy2.robjects import numpy2ri numpy2ri.activate() except ImportError: print "ImportError: networkstatistics.mtcorrect() requires a functional rpy2 and R, exiting..." continue_flag = False if continue_flag: row_names = [x for x in p_value_dict.keys()] p_values_list = [p_value_dict[id] for id in row_names] # need to create an r object first p_values_list_r = robjects.FloatVector(p_values_list) # need to assign the r object into the r namespace r.assign('p_values_list_r', p_values_list_r) method_r = mtcorrect_py_2_r_names[method] r('corrected_data = p.adjust(p_values_list_r, method = ' + str(method_r) + ')') adjusted_p = robjects.numpy2ri.ri2numpy(r('corrected_data')) adjusted_p.tolist adjusted_p = {id: adjusted_p[i] for i, id in enumerate(row_names)} return adjusted_p else: return {}
def differential_gene_expression(self, epoch): generated_raw, generated_labels = self.load_samples(epoch) true_raw, true_labels = self.input_data.get_raw_data() import rpy2.robjects as ro from rpy2.robjects.packages import importr from rpy2.robjects import numpy2ri numpy2ri.activate() utils = importr('utils') utils.install_packages('ROTS', repos='http://cran.us.r-project.org') diff_expression = ro.r['source']( "libraries/differential_gene_expression.R")[0] result = diff_expression(true_raw, true_labels, generated_raw, generated_labels) numpy2ri.deactivate() return np.asarray(result)
def estimateFullCovMatrix_mvnmle(data): import rpy2.robjects as robjects from rpy2.robjects.packages import importr from rpy2.robjects import numpy2ri numpy2ri.activate() importr("mvnmle") r = robjects.r # r('data(apple)') robjects.globalenv["data"] = data # robjects.globalenv["data"] = r('apple') # print("my data = ") # print(robjects.globalenv["data"]) covarianceMatrixEstimate = r('''mlest(data)$sigmahat''') # print("covarianceMatrixEstimate = ") # print(covarianceMatrixEstimate) return covarianceMatrixEstimate
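# Quick check of estimateFullCovMatrix_mvnmle on complete synthetic data; the R package
# 'mvnmle' must be installed. With no missing values the ML estimate should sit close to
# the sample covariance (up to the 1/n versus 1/(n-1) scaling).
import numpy as np

data = np.random.multivariate_normal([0.0, 0.0], [[1.0, 0.3], [0.3, 2.0]], size=500)
sigma_hat = estimateFullCovMatrix_mvnmle(data)
print(np.asarray(sigma_hat))   # 2x2 estimated covariance matrix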
def isotonic_unimodal_regression_R(x, y, normalize_data=True): """ Perform unimodal isotonic regression via the Iso package in R """ numpy2ri.activate() # n_instances = x.shape[0] # assert y.shape[0] == n_instances importr('Iso') z = robjects.r["ufit"](y, x=x, type='b') iso_x, iso_y = numpy.array(z.rx2('x')), numpy.array(z.rx2('y')) if normalize_data: auc = numpy.trapz(iso_y, iso_x) iso_y = iso_y / auc assert is_piecewice_linear_pdf(iso_x, iso_y), numpy.trapz(iso_y, iso_x) return iso_x, iso_y
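# Toy example for isotonic_unimodal_regression_R: noisy samples of a unimodal curve.
# Requires the R package 'Iso' plus this module's numpy/robjects/numpy2ri imports;
# after normalization the fitted curve integrates to ~1 (trapezoidal rule).
import numpy

x = numpy.linspace(0.0, 1.0, 50)
y = numpy.exp(-((x - 0.4) ** 2) / 0.05) + numpy.random.normal(0, 0.05, size=50)
iso_x, iso_y = isotonic_unimodal_regression_R(x, y, normalize_data=True)
print(numpy.trapz(iso_y, iso_x))   # close to 1.0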
def select(self): active_set = self._method.generate_pvalues()[ 0] # gives us selected variables at 1SE if len(active_set) > 0: numpy2ri.activate() rpy.r.assign("X", self.X[:, active_set]) rpy.r.assign("Y", self.Y) rpy.r.assign("K", self.POSI_K) rpy.r('M = lm(Y ~ X - 1)') rpy.r('L = coef(M) - K * sqrt(diag(vcov(M)))') rpy.r('U = coef(M) + K * sqrt(diag(vcov(M)))') L = rpy.r('L') U = rpy.r('U') numpy2ri.deactivate() pre_select = np.nonzero((L > 0) + (U < 0))[0] selected = [active_set[i] for i in pre_select] return selected, active_set else: return [], []
def install_cl(): """Load the `causalLearning` R package and activate necessary conversion :return: The robject for `causalLearning` """ # robjects.r is a singleton robjects.r.options(download_file_method="curl") numpy2ri.activate() package_names = ["devtools"] utils = rpackages.importr("utils") utils.chooseCRANmirror(ind=0) names_to_install = [ x for x in package_names if not rpackages.isinstalled(x) ] if len(names_to_install) > 0: utils.install_packages(StrVector(names_to_install)) return importr("causalLearning")
def fit(self, X, Y): numpy2ri.activate() rPMA = importr('PMA') typex, typez = _check_penalty_type(self.penalty) X, x_mean, x_std = _center_data(X) Y, y_mean, y_std = _center_data(Y) if self.n_component is None: self.n_component = np.min([X.shape[1], Y.shape[1]]) out = rPMA.CCA(x=X, z=Y, K=self.n_component, \ niter=self.n_iter, standardize=False, \ typex=typex, typez=typez, \ penaltyx=self.C[0], penaltyz=self.C[1], \ trace=False) self.u = numpy2ri.ri2py(out[0]) self.v = numpy2ri.ri2py(out[1]) self._x_score, self._y_score = self.transform(X, Y) self._cancorr = _cancorr(X, Y, self.u, self.v) numpy2ri.deactivate() return self
def generate_pvalues(self): try: numpy2ri.activate() rpy.r.assign('X', self.X) rpy.r.assign('y', self.Y) rpy.r.assign('sigma_reid', self.sigma_reid) rpy.r('y = as.numeric(y)') rpy.r.assign('lam', self.lagrange[0]) rpy.r(''' p = ncol(X); n = nrow(X); sigma_est = 1. if (p >= n) { sigma_est = sigma_reid } else { sigma_est = sigma(lm(y ~ X - 1)) } penalty_factor = rep(1, p); lam = lam / sqrt(n); # lambdas are passed a sqrt(n) free from python code soln = selectiveInference:::solve_problem_glmnet(X, y, lam, penalty_factor=penalty_factor, loss="ls") PVS = selectiveInference:::inference_group_lasso(X, y, soln, groups=1:ncol(X), lambda=lam, penalty_factor=penalty_factor, sigma_est, loss="ls", algo="Q", construct_ci=FALSE) active_vars=PVS$active_vars - 1 # for 0-based pvalues = PVS$pvalues ''') pvalues = np.asarray(rpy.r('pvalues')) active_set = np.asarray(rpy.r('active_vars')) numpy2ri.deactivate() if len(active_set) > 0: return active_set, pvalues else: return [], [] except: return [np.nan], [np.nan] # some R failure occurred
def calculatingAovR(tableFactor, Data, Formula): """Computes and fits an Analysis of Variance Model""" numpy2ri.activate() for t in tableFactor: factorName = t[0] factorType = t[1] factorData = t[2] # sending Data to global variable in R (Factor definition for # Subject, Within or Between Type and FloatVector for Covariate if factorType == 'Covariate': tmp = robjects.FloatVector(factorData) robjects.globalenv[factorName] = tmp else: tmp = robjects.r.factor(factorData) robjects.globalenv[factorName] = tmp DataR = robjects.Matrix(Data.T) robjects.globalenv["DataR"] = DataR TextR = 'aov(%s)' % Formula express = robjects.r.parse(text=TextR) Fit = robjects.r.eval(express) robjects.globalenv["Fit"] = Fit raw = robjects.r.summary(Fit) df = [] for r in raw: for d in r[0][0][:-1]: df.append([int(d), int(r[0][0][-1])]) pValue = np.hstack([np.array([c[4][:-1] for c in r]) for r in raw]) FValue = np.hstack([np.array([c[3][:-1] for c in r]) for r in raw]) terms = [] if len(raw) == 1: for r in raw[0]: for t in r.rownames[0:-1]: terms.append(t.replace(' ', '')) else: for i in raw: for r in i: for t in r.rownames[0:-1]: terms.append(t.replace(' ', '')) return pValue, FValue, terms, df
def activate(): """ Activate conversion between sparse matrices from Scipy and R’s Matrix package. Does nothing if this is the active conversion. """ global original_converter if original_converter is not None: return original_converter = conversion.converter numpy2ri.activate() new_converter = conversion.Converter("scipy conversion", template=conversion.converter) numpy2ri.deactivate() overlay_converter(converter, new_converter) conversion.set_conversion(new_converter)
def select(self, X, topic_range): if len(topic_range) == 1: if topic_range[0] == 2: topic_range_ = np.array([2, 3]) else: topic_range_ = np.array([2, topic_range[0]]) else: topic_range_ = topic_range numpy2ri.activate() X_r = np.array(X) lda = maptpx.topics(X_r, K=np.array(topic_range_), verb=self.verbose) criteria = np.array(dollar(lda, "BF")) numpy2ri.deactivate() if len(topic_range) == 1: if topic_range[0] == 2: return np.array([criteria[0]]) else: return np.array([criteria[1]]) else: return np.array(criteria)
def calculate_preprocessing(spc_df): numpy2ri.activate() base = importr('base') utils = importr('utils') prospectr = importr('prospectr') # Ignore first 200 * 0.5 = 100 nm, pick every 20 * 0.5 = 10 nm subsample = list(range(200, spc_df.shape[1], 20)) # This can be used only if the original spectral wavelengths are retained, # i.e. not for SG*-based spectra # SG0 sg0 = np.array(prospectr.savitzkyGolay(spc_df.to_numpy(), m=0, w=101, p=3)) # SG1 sg1 = np.array(prospectr.savitzkyGolay(spc_df.to_numpy(), m=1, w=101, p=3)) # Common for both SG filters because they use the same width sg_subsample = list(range(150, sg1.shape[1], 20)) return { "Absorbances": spc_df.iloc[:, subsample].to_numpy(), "Absorbances-SG0-SNV": np.array(prospectr.standardNormalVariate(sg0))[:, sg_subsample], "Absorbances-SG1": sg1[:, sg_subsample], "Absorbances-SG1-SNV": np.array(prospectr.standardNormalVariate(sg1))[:, sg_subsample], "CR": np.array(prospectr.continuumRemoval(spc_df.to_numpy(), type="A"))[:, subsample], "Absorbances-SNV-DT": np.array( prospectr.detrend( spc_df.to_numpy(), wav=rpy2.robjects.FloatVector( spc_df.columns.astype('float').to_numpy())))[:, subsample] }
def probit_MLE(X, y, formula_terms, truth=None, alpha=0.1): numpy2ri.activate() rpy.r.assign('X', X) rpy.r.assign('y', y) rpy.r('D = data.frame(X, y)') rpy.r('M = glm(y ~ %s, family=binomial(link="probit"), data=D)' % ' + '.join(formula_terms)) beta_hat = rpy.r('coef(M)') target_cov = rpy.r('vcov(M)') if truth is None: truth = np.zeros_like(beta_hat) SE = np.sqrt(np.diag(target_cov)) Z = (beta_hat - truth) / SE Z0 = beta_hat / SE pvalues = normal_dbn.cdf(Z0) pvalues = 2 * np.minimum(pvalues, 1 - pvalues) pivots = normal_dbn.cdf(Z) pivots = 2 * np.minimum(pivots, 1 - pivots) upper = beta_hat + normal_dbn.ppf(1 - 0.5 * alpha) * SE lower = beta_hat - normal_dbn.ppf(1 - 0.5 * alpha) * SE covered = (upper > truth) * (lower < truth) results_df = pd.DataFrame({ 'naive_pivot': pivots, 'naive_pvalue': pvalues, 'naive_coverage': covered, 'naive_length': upper - lower, 'naive_upper': upper, 'naive_lower': lower, 'variable': formula_terms, }) return beta_hat, target_cov, results_df
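# Hedged usage sketch for probit_MLE on simulated data. R names the columns of
# data.frame(X, y) as X1..Xp, so the formula terms below follow that convention;
# '1' stands for the intercept so that the term list lines up with coef(M).
import numpy as np

n, p = 200, 3
X = np.random.standard_normal((n, p))
beta = np.array([1.0, 0.0, -0.5])
y = (X.dot(beta) + np.random.standard_normal(n) > 0).astype(float)
formula_terms = ['1', 'X1', 'X2', 'X3']
beta_hat, target_cov, results = probit_MLE(X, y, formula_terms, alpha=0.1)
print(results[['variable', 'naive_pvalue', 'naive_lower', 'naive_upper']])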
def select(self): numpy2ri.activate() rpy.r.assign('chol_k', self.knockoff_chol) rpy.r(''' knockoffs = function(X) { mu = rep(0, ncol(X)) mu_k = X # sweep(X, 2, mu, "-") %*% SigmaInv_s X_k = mu_k + matrix(rnorm(ncol(X) * nrow(X)), nrow(X)) %*% chol_k return(X_k) } ''') numpy2ri.deactivate() if True: numpy2ri.activate() rpy.r.assign('X', self.X) rpy.r.assign('Y', self.Y) rpy.r.assign('q', self.q) if self.forward_step: rpy.r( 'V = knockoff.filter(X, Y, fdr=q, knockoffs=knockoffs, stat=stat.forward_selection)$selected' ) elif self.sqrt_lasso: rinterface.set_writeconsole_regular(null_print) rpy.r( 'V = knockoff.filter(X, Y, fdr=q, knockoffs=knockoffs, stat=stat.sqrt_lasso)$selected' ) rinterface.set_writeconsole_regular(rinterface.consolePrint) else: rpy.r( 'V = knockoff.filter(X, Y, fdr=q, knockoffs=knockoffs)$selected' ) rpy.r('if (length(V) > 0) {V = V-1}') V = rpy.r('V') numpy2ri.deactivate() return np.asarray(V, np.int), np.asarray(V, np.int) else: # except: return [], []
def cv_glmnet_lam(X, Y, seed=0): """ Some calculations that can be reused by methods: lambda.min, lambda.1se, lambda.theory and Reid et al. estimate of noise """ numpy2ri.activate() rpy.r('set.seed(%d)' % seed) rpy.r.assign('X', X.copy()) rpy.r.assign('Y', Y.copy()) rpy.r('X=as.matrix(X)') rpy.r('Y=as.numeric(Y)') rpy.r('set.seed(1)') rpy.r('G = cv.glmnet(X, Y, intercept=FALSE, standardize=FALSE)') rpy.r("L = G[['lambda.min']]") rpy.r("L1 = G[['lambda.1se']]") L = rpy.r('L') L1 = rpy.r('L1') numpy2ri.deactivate() return float(1.00001 * L[0]), float(1.00001 * L1[0]),
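# Quick sanity check of cv_glmnet_lam on random data; this sketch assumes the module's
# rpy / numpy2ri imports and attaches glmnet explicitly, since the helper itself only
# calls cv.glmnet and expects the package to be loaded in the embedded R session already.
import numpy as np
import rpy2.robjects as rpy

rpy.r('library(glmnet)')
X = np.random.standard_normal((100, 20))
Y = X[:, 0] * 2.0 + np.random.standard_normal(100)
lam_min, lam_1se = cv_glmnet_lam(X, Y, seed=0)
print(lam_min, lam_1se)   # lambda.min and lambda.1se, each inflated by a factor 1.00001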
def _get_tree_onepiece_R(points, n_nodes=25): """ Get a tree from a set of points. Wrapping around ElPiGraph.R computeElasticPrincipalTree. """ from rpy2.robjects.packages import importr from rpy2.robjects import numpy2ri elpi = importr("ElPiGraph.R") numpy2ri.activate() tmp = elpi.computeElasticPrincipalTree(X=points, NumNodes=n_nodes, drawAccuracyComplexity=False, drawEnergy=False, drawPCAView=False, verbose=False ) numpy2ri.deactivate() nodes = np.array(tmp[0][0]) edges = np.array(tmp[0][1][0]) - 1 return nodes, edges
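# Minimal sketch for _get_tree_onepiece_R: fit a small principal tree to a noisy 2-D
# point cloud. Requires the ElPiGraph.R package on the R side; the synthetic "L-shaped"
# data below is only meant to give the tree something branch-like to fit.
import numpy as np

points = np.concatenate([
    np.column_stack([np.linspace(0, 1, 50), np.zeros(50)]),
    np.column_stack([np.zeros(50), np.linspace(0, 1, 50)]),
]) + np.random.normal(0, 0.02, size=(100, 2))
nodes, edges = _get_tree_onepiece_R(points, n_nodes=10)
print(nodes.shape)   # (10, 2): node coordinates
print(edges.shape)   # (n_edges, 2): 0-based node index pairs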
def run_minet(filename, algo): rn.activate() code = """library(minet) filename <- '""" + filename + """' first <- readLines(filename, n=1) names <- strsplit(first, '\t') names <- unlist(names, use.names=FALSE) d <- read.table(filename, skip=1, col.names = names) mim <- build.mim(d, estimator = "mi.empirical", disc = "equalfreq") weight_adjacency_matrix <- minet(mim, method='""" + algo + """', estimator="mi.empirical", disc="equalfreq"); weight_adjacency_matrix; """ f = ro.r(code) weight_adjacency_matrix = np.array(f) return weight_adjacency_matrix
def multipleImputationMethod(data, nrImputedDataSets=5): import rpy2.robjects as robjects from rpy2.robjects.packages import importr from rpy2.robjects import numpy2ri numpy2ri.activate() importr("mice") r = robjects.r robjects.globalenv["data"] = data r('''myImputerForData <- mice(data, m = 5, method = 'pmm', seed = 101)''') allDataImputed = [] for i in range(1, nrImputedDataSets + 1): imputedData = r('data.matrix(complete(myImputerForData,' + str(i) + '))') # for the remaining NAN use simple mean imputation imputedData = meanImputation(imputedData) assert (not numpy.any(numpy.isnan(imputedData))) allDataImputed.append(imputedData) return allDataImputed
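# Illustrative run of multipleImputationMethod on a small matrix with missing values.
# The R package 'mice' must be installed, and meanImputation() is assumed to be the
# helper defined elsewhere in this module for any cells mice leaves unfilled.
import numpy

data = numpy.random.standard_normal((50, 4))
data[numpy.random.rand(50, 4) < 0.1] = numpy.nan   # knock out roughly 10% of the entries
imputed_sets = multipleImputationMethod(data, nrImputedDataSets=5)
print(len(imputed_sets), imputed_sets[0].shape)     # 5 completed copies of the data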
def naive_estimator(self, active_set): """ selected model """ numpy2ri.activate() if self.model_target == 'selected': rpy.r.assign("X", self.X[:, active_set]) else: n, p = self.X.shape if n > p: rpy.r.assign("X", self.X) else: return (active_set, np.ones(p) * np.nan) rpy.r.assign("Y", self.Y) rpy.r('beta_hat = coef(lm(Y ~ X - 1))') beta_hat = np.asarray(rpy.r('beta_hat')) n, p = self.X.shape beta_full = np.zeros(p) beta_full[active_set] = beta_hat return active_set, beta_full
def __init__(self,step_pattern="symmetric2",window_type=None,window_size=10000, distance_only=False,open_end=False,open_begin=False,rdtw=None): self.step_pattern = step_pattern self.window_type = window_type self.window_size = window_size self.distance_only = distance_only self.open_end = open_end self.open_begin = open_begin # # parameter check # if self.window_type is not None and window_size is None: # raise ValueError("must specify window_size if window_type is not None.") """""" if rdtw is None: # rdtw package object self._rdtw = importr("dtw") else: self._rdtw = rdtw # array conversion activation numpy2ri.activate() pandas2ri.activate() # set window type if it's none if self.window_type is None: self.window_type = "none"
def compute_results(y, X, sigma, active, full_results={}, do_knockoff=False, do_AIC=True, do_BIC=True, do_glmnet=True, alpha=0.05, maxstep=np.inf, compute_maxT_identify=True, burnin=2000, ndraw=8000): n, p = X.shape results, FS = compute_pvalues(y, X, active, sigma, maxstep=maxstep, compute_maxT_identify=compute_maxT_identify, burnin=burnin, ndraw=ndraw) completion_idx = completion_index(results['variable_selected'], active) full_results.setdefault('completion_idx', []).append(completion_idx) for column in results.columns: for i in range(results.shape[0]): full_results.setdefault('%s_%d' % (str(column), i+1), []).append(results[column][i]) for i in range(len(active)): full_results.setdefault('active_%d' % (i+1,), []).append(active[i]) full_results.setdefault('alpha', []).append(alpha) if do_knockoff: # this will probably not work on miller import rpy2.robjects as rpy from rpy2.robjects import numpy2ri rpy.conversion.py2ri = numpy2ri.numpy2ri numpy2ri.activate() rpy.r.assign('X', X) rpy.r.assign('y', y) # knockoff rpy.r.assign('alpha', alpha) knockoff = np.array(rpy.r(""" library(knockoff) knockoff.filter(X = X, y = y, fdr=alpha, knockoffs=create.fixed, offset=0)$selected """)) - 1 knockoff_R = knockoff.shape[0] knockoff_V = knockoff_R - len(set(active).intersection(knockoff)) knockoff_screen = set(knockoff).issuperset(active) knockoff_plus = np.array(rpy.r(""" knockoff.filter(X = X, y = y, fdr=alpha, knockoffs=create.fixed, offset=1)$selected """)) - 1 knockoff_plus_R = knockoff_plus.shape[0] knockoff_plus_V = knockoff_plus_R - len(set(active).intersection(knockoff_plus)) knockoff_plus_screen = set(knockoff_plus).issuperset(active) full_results.setdefault('knockoff_R', []).append(knockoff_R) full_results.setdefault('knockoff_V', []).append(knockoff_V) full_results.setdefault('knockoff_screen', []).append(knockoff_screen) full_results.setdefault('knockoff_plus_R', []).append(knockoff_plus_R) full_results.setdefault('knockoff_plus_V', []).append(knockoff_plus_V) full_results.setdefault('knockoff_plus_screen', []).append(knockoff_plus_screen) numpy2ri.deactivate() if do_AIC: # this will probably not work on miller import rpy2.robjects as rpy from rpy2.robjects import numpy2ri rpy.conversion.py2ri = numpy2ri.numpy2ri numpy2ri.activate() rpy.r.assign('X', X) rpy.r.assign('y', y) rpy.r('''M = step(lm(y ~ 1, data=data.frame(X, y)), scope=list(upper="~ %s"), direction="forward", trace=FALSE)''' % ' + '.join(['X%d' % i for i in range(1, p+1)])) AIC = np.asarray([int(v[1:]) for v in rpy.r("all.vars(M$call$formula[[3]])")]) - 1 # subtract 1 for 0-based indexing AIC_R = AIC.shape[0] AIC_V = AIC_R - len(set(active).intersection(AIC)) AIC_screen = set(AIC).issuperset(active) full_results.setdefault('AIC_R', []).append(AIC_R) full_results.setdefault('AIC_V', []).append(AIC_V) full_results.setdefault('AIC_screen', []).append(AIC_screen) numpy2ri.deactivate() if do_BIC: import rpy2.robjects as rpy from rpy2.robjects import numpy2ri rpy.conversion.py2ri = numpy2ri.numpy2ri numpy2ri.activate() rpy.r.assign('X', X) rpy.r.assign('y', y) rpy.r('''M = step(lm(y ~ 1, data=data.frame(X, y)), scope=list(upper="~ %s"), direction="forward", k=log(nrow(X)), trace=FALSE)''' % ' + '.join(['X%d' % i for i in range(1, p+1)])) BIC = np.asarray([int(v[1:]) for v in rpy.r("all.vars(M$call$formula[[3]])")]) - 1 # subtract 1 for 0-based indexing BIC_R = BIC.shape[0] BIC_V = BIC_R - len(set(active).intersection(BIC)) BIC_screen = set(BIC).issuperset(active) full_results.setdefault('BIC_R', []).append(BIC_R) 
full_results.setdefault('BIC_V', []).append(BIC_V) full_results.setdefault('BIC_screen', []).append(BIC_screen) numpy2ri.deactivate() if do_glmnet: import rpy2.robjects as rpy from rpy2.robjects import numpy2ri rpy.conversion.py2ri = numpy2ri.numpy2ri numpy2ri.activate() rpy.r.assign('X', X) rpy.r.assign('y', y) rpy.r('''library(glmnet); y = as.matrix(y); X = as.matrix(X); CVG = cv.glmnet(X, y); G = glmnet(X, y); B = coef(G, s=CVG$lambda.min, exact=TRUE); selected = which(B[2:length(B)] != 0); B2 = coef(G, s=CVG$lambda.1se, exact=TRUE); selected2 = which(B2[2:length(B2)] != 0); ''') GLMnet = np.asarray(rpy.r("selected")) - 1 # subtract 1 for 0-based indexing GLMnet_R = GLMnet.shape[0] GLMnet_V = GLMnet_R - len(set(active).intersection(GLMnet)) GLMnet_screen = set(GLMnet).issuperset(active) full_results.setdefault('GLMnet_R', []).append(GLMnet_R) full_results.setdefault('GLMnet_V', []).append(GLMnet_V) full_results.setdefault('GLMnet_screen', []).append(GLMnet_screen) GLMnet1se = np.asarray(rpy.r("selected2")) - 1 # subtract 1 for 0-based indexing GLMnet1se_R = GLMnet1se.shape[0] GLMnet1se_V = GLMnet1se_R - len(set(active).intersection(GLMnet1se)) GLMnet1se_screen = set(GLMnet1se).issuperset(active) full_results.setdefault('GLMnet1se_R', []).append(GLMnet1se_R) full_results.setdefault('GLMnet1se_V', []).append(GLMnet1se_V) full_results.setdefault('GLMnet1se_screen', []).append(GLMnet1se_screen) numpy2ri.deactivate() for pval, rule_ in product(['maxT_identify_pvalue', 'maxT_identify_unknown_pvalue', 'maxT_unknown_pvalue', 'saturated_pvalue', 'nominal_pvalue', 'nominalT_pvalue', 'maxT_pvalue'], zip([simple_stop, strong_stop, forward_stop], ['simple', 'strong', 'forward'])): rule, rule_name = rule_ (R, V_var, V_model, screen, FWER_model, FDP_model, FDP_var, S_var) = summary(np.asarray(results['variable_selected']), results[pval], active, rule, alpha) pval_name = '_'.join(pval.split('_')[:-1]) for (n, value) in zip(['R', 'V_var', 'V_model', 'FDP_model', 'FDP_var', 'S_var', 'FWER_model', 'screen'], [R, V_var, V_model, FDP_model, FDP_var, S_var, FWER_model, screen]): full_results.setdefault('%s_%s_%s' % (pval_name, rule_name, n), []).append(value) return full_results, FS
## This file contains the methods needed to perform the factor anaylsis ## to derive the competencies of the informants. import pandas from sklearn import preprocessing import numpy import rpy2.robjects.packages as rpackages import rpy2.robjects.numpy2ri as np2ri import rpy2.robjects.pandas2ri as pandas2ri import rpy2.robjects as ro np2ri.activate() psych = rpackages.importr('psych') # Convert the string response of each informant into a matrix # with a column for each word, where each cell[i,j] holds # informant i's rank of word j. def buildMatrix(responses): results = {} for i,l in responses.str.split(",").iteritems(): row = {} for pos,j in enumerate(l): row[j] = pos + 1 results[i] = row matrix = pandas.DataFrame(results) matrix = matrix.T return matrix.reset_index(drop=True) # Build a response matrix for each scale, excluding informants who did not # perform the task for that scale def buildMatrices(dataFrame): matrices = {}
Requires: R (3.1+)
"""
__author__ = "Zhou Fang"
__email__ = "*****@*****.**"

from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from sklearn.metrics import r2_score
from sklearn.utils.validation import NotFittedError, check_is_fitted
from sklearn.utils import check_array, check_X_y
from rpy2.robjects import r
import rpy2.robjects as robjects
import rpy2.robjects.packages as rpackages
import rpy2.robjects.numpy2ri as rpyn

rpyn.activate()
# import R's utility package
utils = rpackages.importr("utils")
# select a mirror for R packages
utils.chooseCRANmirror(ind=1)  # select the first mirror in the list
# R package names we need (a sequence, so the check below iterates package names, not characters)
packnames = ("mgcv",)
# Install necessary packages that haven't been installed yet
packnames_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
if len(packnames_to_install) > 0:
    utils.install_packages(robjects.StrVector(packnames_to_install))
def _exec_r_module(self): try: import rpy2.robjects from rpy2.robjects import numpy2ri from rpy2.robjects import pandas2ri from rpy2.robjects.packages import importr except ImportError: raise ImportError( 'R module cannot be run, because ' '"rpy2" package is not installed.' ) module_name = os.path.splitext(os.path.basename(self.source_file))[0] logger.debug( 'import module "%s" from source file: %s', self.source_file ) logger.debug('source module: "%s"', self.source_file) rpy2.robjects.r('source("{0}")'.format(self.source_file)) module = rpy2.robjects.r[module_name] version = module.get('VERSION')[0] if version != self.handles.version: raise PipelineRunError( 'Version of source and handles is not the same.' ) func = module.get('main') numpy2ri.activate() # enables use of numpy arrays pandas2ri.activate() # enable use of pandas data frames kwargs = self.keyword_arguments logger.debug( 'evaluate main() function with INPUTS: "%s"', '", "'.join(kwargs.keys()) ) # R doesn't have unsigned integer types for k, v in kwargs.iteritems(): if isinstance(v, np.ndarray): if v.dtype == np.uint16 or v.dtype == np.uint8: logging.debug( 'module "%s" input argument "%s": ' 'convert unsigned integer data type to integer', self.name, k ) kwargs[k] = v.astype(int) elif isinstance(v, pd.DataFrame): # TODO: We may have to translate pandas data frames explicitly # into the R equivalent. # pandas2ri.py2ri(v) kwargs[k] = v args = rpy2.robjects.ListVector({k: v for k, v in kwargs.iteritems()}) base = importr('base') r_out = base.do_call(func, args) for handle in self.handles.output: # NOTE: R functions are supposed to return a list. Therefore # we can extract the output argument using rx2(). # The R equivalent would be indexing the list with "[[]]". if isinstance(r_out.rx2(handle.name), rpy2.robjects.vectors.DataFrame): handle.value = pandas2ri.ri2py(r_out.rx2(handle.name)) # handle.value = pd.DataFrame(r_out.rx2(handle.name)) else: # NOTE: R doesn't have an unsigned integer data type. # So we cast to uint16. handle.value = numpy2ri.ri2py(r_out.rx2(handle.name)).astype( np.uint16 ) # handle.value = np.array(r_out.rx2(handle.name), np.uint16) return self.handles.output
def infer_pseudotime(data, output_directory, tag = '', pcv_method = 'Rprincurve', anchor_gene = None, markers = None): assert pcv_method in {'Rprincurve'} # taking into account the possibility of adding # in future versions other methods # for principal curve analysis N_dim = 3 model = TSNE(n_components = N_dim) TSNE_data = model.fit_transform(data) if pcv_method == 'Rprincurve': with open(path.join(output_directory, "{0}_TSNE_d{1}.tsv".format(tag, N_dim)), 'w') as f: f.write('\t'.join(['T{0}'.format(k) for k in xrange(1, N_dim + 1)])) f.write('\n') np.savetxt(f, TSNE_data, fmt = '%.6f', delimiter = '\t') numpy2ri.activate() princurve = importr('princurve') procedure = princurve.principal_curve fitpc = procedure(TSNE_data, NULL, 0.001, TRUE, 200, 2, 'lowess') curve_projections_matrix = np.array(fitpc.rx('s')[0]) pseudotime_series = np.array(fitpc.rx('lambda')[0]) with open(path.join(output_directory, "{0}_TSNE_d{1}_pcv.tsv".format(tag, N_dim)), 'w') as f: np.savetxt(f, curve_projections_matrix, fmt = '%.6f', delimiter = '\t') with open(path.join(output_directory, "{0}_TSNE_d{1}_lambda.tsv".format(tag, N_dim)), 'w') as f: np.savetxt(f, pseudotime_series, fmt = '%.6f', delimiter = '\t') else: print("ERROR: PySCUBA: Preprocessing: infer_pseudotime:\n" "your choice of method for principal curve analysis is not supported " "by the present version of PySCUBA.") exit(1) if anchor_gene: assert isinstance(anchor_gene, str) assert markers is not None N_cells_anchor = 1000 gene_idx = np.where(markers == anchor_gene)[0] pseudotime_idx = np.argsort(pseudotime_series) anchor_gene_avg_beg = np.mean(data[pseudotime_idx[:N_cells_anchor], gene_idx]) anchor_gene_avg_end = np.mean(data[pseudotime_idx[N_cells_anchor:], gene_idx]) if anchor_gene_avg_end > anchor_gene_avg_beg: pseudotime_series = np.max(pseudotime_series) - pseudotime_series t_min = np.min(pseudotime_series) t_max = np.max(pseudotime_series) t_bins = 8 cell_stages = t_bins * (pseudotime_series - t_min + 0.0001) / (t_max - t_min + 0.0002) cell_stages = np.ceil(cell_stages).astype(int).astype('str') return cell_stages
def get_pvalue_from_scores(result_dict, permutation_dict, **kwargs): """ Uses network scores to calculate p-values. Arguments: result_dict: a dict of results from the network propagation. e.g. {node id : score value} permutation_dict: a dict of permutation results, e.g. {node id: [list of score values]} kwargs: verbose returns: ttp_dict: dictionary of p-values from a two-tailed test of significant side_dict: a dict to indicate if the result lies in the upper or lower tail ('+' or '-') """ continue_flag = True verbose = test_kwarg('verbose', kwargs, [False, True]) try: from rpy2.robjects import r from rpy2 import robjects from rpy2.robjects import numpy2ri numpy2ri.activate() except ImportError: print "networkstatistics.get_pvalue_from_scores requires a functional rpy2 and R, exiting..." continue_flag = False if continue_flag: from time import time start_time = time() ttp_dict = {} side_dict = {} # Also require the pareto extrapolation code the_file = 'ParetoExtrapolation.R' import nampy import platform # This may need some fix, I've only tried # this on OSX so far. if platform.system() == 'Windows': pareto_file_path = nampy.__path__[0] + '\\rfiles\\' else: pareto_file_path = nampy.__path__[0] + '/rfiles/' r('source(%s)' %('"' + pareto_file_path + the_file + '"')) test_ids = result_dict.keys() n_tests = len(test_ids) start_time = time() for i, the_id in enumerate(test_ids): test_statistic = result_dict[the_id] null_distribution = permutation_dict[the_id] n_permutations = len(null_distribution) # This won't work by definition if we have less than 20 permutations if n_permutations < 20: if verbose: print 'Warning, null distribution is too small. Run more permutations. Exiting...' return (None, None) M1 = sum(null_distribution > test_statistic) M2 = sum(null_distribution < test_statistic) # Want to know which end the test statistics lies towards if M2 < M1: side = '-' M = M2 else: side = '+' M = M1 if (M >= 10): estimated_p = 2. * M / n_permutations else: # need to create an r object first null_distribution_r = robjects.FloatVector(null_distribution) # need to assign the r object into the r namespace r.assign('null_distribution_r', null_distribution_r) r.assign('test_statistic', test_statistic) # now run our r scripts r('fit = getParetoFit(null_distribution_r, side="two-sided")') r('distCDF = paretoExtrapolate(test_statistic, fit)') estimated_p = robjects.numpy2ri.ri2numpy(r('distCDF'))[0] ttp_dict[the_id] = estimated_p side_dict[the_id] = side if verbose: if (i+1) % 1000 == 0: print 'Test %i of %i, el: %f hr' %((i + 1), n_tests, (time() - start_time)/3600.) return ttp_dict, side_dict else: return {}, {}
from __future__ import division import numpy as np from itertools import izip from csv import DictReader, writer from sys import argv, stdout, stderr from getopt import getopt, GetoptError from rpy2.robjects import packages, numpy2ri # R in Python ## activate R functionality stats = packages.importr('stats') # R stats numpy2ri.activate() # automatic numpy-to-R ## globals DEFAULT_GROUP_COLUMN = '' DEFAULT_OUTPUT_FID = stdout DEFAULT_P_THRESHOLD = .2 MAX_DROP_PCT = .75 USAGE = """ USAGE: {} -a GRP1 -b GRP2 [-d] -m FEATS [-g GCOL] [-o OUTPUT] [-p P] INPUT """.format(__file__) ## helper functions def colmean(x):
def numpy2ri_close(cls):
    """
    Turn off the automatic conversion between R objects and numpy objects (class method).
    :return: None
    """
    numpy2ri.deactivate()
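# numpy2ri_close above flips a process-global conversion switch; on newer rpy2 versions
# the same conversion can instead be scoped with a context manager, sketched here as an
# alternative (not part of the class above).
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects import numpy2ri
from rpy2.robjects.conversion import localconverter

with localconverter(robjects.default_converter + numpy2ri.converter):
    r_sum = robjects.r['sum'](np.arange(10.0))   # numpy -> R conversion only inside this block
print(list(r_sum)[0])   # 45.0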
def setUp(self):
    #self._py2ri = robjects.conversion.py2ri
    #self._ri2py = robjects.conversion.ri2py
    rpyn.activate()
import rpy2.robjects as robjects from rpy2.robjects.packages import importr from rpy2.robjects import numpy2ri numpy2ri.activate() from sklearn.base import BaseEstimator, ClassifierMixin import warnings import numpy as np R = robjects.r class LogisticRegressionCV(BaseEstimator, ClassifierMixin): def __init__(self, binary_classification=True, n_jobs=1, random_state=None, glmnet_params={}): """ binary_classification: if True use family = binomial, else multinomial random_state: int or None, set.seed n_jobs: number of workers glmnet_params: params for glmnet """ self.random_state = random_state self.glmnet_params = glmnet_params self.binary_classification = binary_classification warnings.warn('No validity check for glmnet_params') self.n_jobs = n_jobs def __del__(self): if hasattr(self, 'cluster_'): R['stopCluster'](self.cluster_) def fit(self, X, y): importr('glmnet') family = 'binomial' if self.binary_classification else 'multinomial' if self.random_state is not None: R['set.seed'](self.random_state) if self.n_jobs > 1:
def _init_r(): """Private function to initialise R, only executed when needed. """ global _r_initialised global r global ro global grdevices global ape if not _r_initialised: import rpy2.robjects as ro # noqa from rpy2.robjects import r from rpy2.robjects.packages import importr import rpy2.robjects.numpy2ri as numpy2ri numpy2ri.activate() grdevices = importr('grDevices') ape = importr( 'ape', robject_translations={ 'delta.plot': 'delta_dot_plot', 'dist.dna': 'dist_dot_dna', 'dist.nodes': 'dist_dot_nodes', 'node.depth': 'node_dot_depth', 'node.depth.edgelength': 'node_dot_depth_dot_edgelength', 'node.height': 'node_dot_height', 'node.height.clado': 'node_dot_height_dot_clado', 'prop.part': 'prop_dot_part', } ) # Define custom R functions to help with coloring tree edges by # population. These functions were written by Jacob Almagro-Garcia # <*****@*****.**> at the Wellcome Trust Sanger Institute. r(""" library(ape) ###################################################################################################################### #' Computes the number of leaves of each group that hang from each branch. #' @param phylotree A tree of class phylo. #' @param labelgroups A vector with the group of the tip labels (named with the labels). #' @return A named matrix with the membership counts for each interior edge of the tree. ###################################################################################################################### computeEdgeGroupCounts <- function(phylotree, labelgroups) { labels <- phylotree$tip.label num_tips <- length(labels) edge_names <- unique(sort(c(phylotree$edge))) # This matrix will keep track of the group counts for each edge. edge_group_counts <- matrix(0, nrow=length(edge_names), ncol=length(unique(sort(labelgroups)))) rownames(edge_group_counts) <- edge_names colnames(edge_group_counts) <- unique(labelgroups) # Init the leaf branches. sapply(1:num_tips, function(l) { edge_group_counts[as.character(l), as.character(labelgroups[phylotree$tip.label[l]])] <<- 1 }) # Sort edges by the value of the descendent # The first segment will contain the leaves whereas the second the branches (closer to leaves first). # We need to do this because leaves are numbered 1:num_tips and the branches CLOSER to the leaves # with higher numbers. edges <- phylotree$edge[order(phylotree$edge[,2]),] branches <- edges[num_tips:nrow(edges),] edges[num_tips:nrow(edges),] <- branches[order(branches[,1],decreasing=T),] invisible(apply(edges, 1, function(edge) { # Check if we are connecting a leaf. if(edge[2] <= num_tips) { e <- as.character(edge[1]) g <- as.character(labelgroups[phylotree$tip.label[edge[2]]]) edge_group_counts[e,g] <<- edge_group_counts[e,g] + 1 } else { e1 <- as.character(edge[1]) e2 <- as.character(edge[2]) edge_group_counts[e1,] <<- edge_group_counts[e1,] + edge_group_counts[e2,] } })) return(edge_group_counts) } ###################################################################################################################### #' Assigns the color of the majority group (hanging from) each branch. #' @param phylotree A tree of class phylo. #' @param edge_group_counts A named matrix with the group counts for each branch. #' @param groupcolors A named vector with the color of each group. #' @param equality_color The color to be used if there is no majority group. #' @return A vector with the colors to be used with the tree branches. 
###################################################################################################################### assignMajorityGroupColorToEdges <- function(phylotree, edge_group_counts, groupcolors, equality_color="gray") { edge_colors <- apply(phylotree$edge, 1, function(branch) { e <- as.character(branch[2]) major_group_index <- which.max(edge_group_counts[e,]) if(all(edge_group_counts[e,] == edge_group_counts[e,major_group_index])) return(equality_color) else return(groupcolors[colnames(edge_group_counts)[major_group_index]]) }) return(edge_colors) } """) # noqa _r_initialised = True