def _pr_rc_curve_r(observations, predictions, FDRth=0.05):
    """Compute a precision-recall curve via the R PRROC package.

    :param observations: known truth set (class labels / weights)
    :param predictions: all data (prediction scores)
    :param FDRth: false discovery rate threshold (default 0.05)
    :return: (df, auc, SENS, FDR5percTh) -- the PR curve as a DataFrame, the
        integrated AUC, the recall at the FDR threshold, and the threshold value
    """
    obs_rtbl = numpy2ri.py2ri(observations)
    prd_rtbl = numpy2ri.py2ri(predictions)
    curve_prm = {'scores.class0': prd_rtbl,
                 'weights.class0': obs_rtbl,
                 'curve': True,
                 'sorted': True}
    prc = PRROC.pr_curve(**curve_prm)
    auc = prc.rx2('auc.integral')[0]
    curve = numpy2ri.ri2py(prc.rx2('curve'))
    cols = ['recall', 'precision', 'threshold']
    df = pd.DataFrame(curve, columns=cols)
    # negative of the smallest score threshold at which precision still meets the FDR bound
    FDR5percTh = -(df[df.precision >= (1 - FDRth)])['threshold'].min()
    if not np.isnan(FDR5percTh):
        index_min = min(df[df.precision >= (1 - FDRth)].index.tolist())
    else:
        index_min = 0
    SENS = df.at[index_min, 'recall']
    threshold = -FDR5percTh
    return df, auc, SENS, FDR5percTh
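# Hedged usage sketch (added for illustration, not from the original module):
# shows how the PR-curve helper above might be called. It assumes the
# module-level setup the function already relies on (numpy as np, pandas as pd,
# numpy2ri, and PRROC = importr('PRROC')); the labels and scores are toy data.
toy_labels = np.array([1, 1, 0, 1, 0, 0, 1, 0])                    # known truth set (0/1)
toy_scores = np.array([0.9, 0.8, 0.7, 0.6, 0.4, 0.35, 0.3, 0.1])   # prediction scores

pr_df, pr_auc, sens_at_fdr, fdr_threshold = _pr_rc_curve_r(toy_labels, toy_scores, FDRth=0.05)
print(pr_auc, sens_at_fdr)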
def plot_qc_metrics(self, output_dir):
    """Plot QC results from the ENmix pipeline and, where possible, from minfi.
    Still experimental.

    Parameters
    ----------
    output_dir
        Where to store plots."""
    self.enmix.plotCtrl(self.RGset)
    grdevice = importr("grDevices")
    geneplotter = importr("geneplotter")
    base = importr('base')
    anno = self.minfi.getAnnotation(self.RGset)
    anno_py = pandas2ri.ri2py(robjects.r['as'](anno, 'data.frame'))
    beta_py = pandas2ri.ri2py(self.beta)
    # split beta values by Infinium probe design type
    beta1 = numpy2ri.py2ri(beta_py[anno_py["Type"] == "I"])
    beta2 = numpy2ri.py2ri(beta_py[anno_py["Type"] == "II"])
    grdevice.jpeg(output_dir + '/dist.jpg', height=900, width=600)
    base.par(mfrow=robjects.vectors.IntVector([3, 2]))
    self.enmix.multidensity(self.beta, main="Multidensity")
    self.enmix.multifreqpoly(self.beta, xlab="Beta value")
    self.enmix.multidensity(beta1, main="Multidensity: Infinium I")
    self.enmix.multifreqpoly(beta1, main="Multidensity: Infinium I", xlab="Beta value")
    self.enmix.multidensity(beta2, main="Multidensity: Infinium II")
    self.enmix.multifreqpoly(beta2, main="Multidensity: Infinium II", xlab="Beta value")
    grdevice.dev_off()
    self.minfi.qcReport(self.RGset, pdf="{}/qcReport.pdf".format(output_dir))
    self.minfi.mdsPlot(self.RGset)
    self.minfi.densityPlot(self.RGset, main='Beta', xlab='Beta')
def vs_sample_vec(y_arr, dat_arr, w=None, p=0.5):
    """Sample Variogram Score; vectorized version

    Compute the variogram score VS(*y_arr*, *dat_arr*), where *y_arr* is a
    series of *d*-dimensional observations and *dat_arr* is a series of
    samples of multivariate forecasts.

    For details, see Scheuerer, M. and Hamill, T.M. (2015). Variogram-based
    proper scoring rules for probabilistic forecasts of multivariate
    quantities. Monthly Weather Review, 143, 1321-1334.

    Args:
        *y_arr* (np.array): Series of observations of shape (*d*, *n*), where
            *d* is the dimension of the observations and *n* the number of
            observations. Hence each column contains a single *d*-dimensional
            realization.
        *dat_arr* (np.array): Forecast sample of shape (*d*, *m*, *n*), where
            *d* is the dimension of the realized values, *m* the number of
            samples, and *n* the number of realizations.
        *p* (float): Order of variogram score. Standard choices include
            *p* = 1 and *p* = 0.5 (default).
        *w* (np.array): Numeric array of weights for *dat* used in the
            variogram score. If no weights are specified, constant weights
            with *w* = 1 are used.

    Returns:
        np.array: Variogram score of each forecast-observation pair.
    """
    try:
        y_arr = np.array(y_arr)
        y_arr = np.expand_dims(y_arr, 1)
        dat_arr = np.array(dat_arr)
        p_r = float(p)
        if w is None:
            w_r = rpy2.robjects.NULL
        else:
            w = np.array(w)
            w_r = np2ri.py2ri(w)
    except Exception:
        print('Input has wrong format.')
    else:
        if (len(y_arr.shape) != 3 or len(dat_arr.shape) != 3
                or y_arr.shape[0] != dat_arr.shape[0]
                or y_arr.shape[2] != dat_arr.shape[2]):
            raise ValueError('Parameters have wrong dimension.')
        # stack observation and ensemble along the member axis and score in R
        df = np.concatenate((y_arr, dat_arr), axis=1)
        df_r = np2ri.py2ri(df)
        rpy2.robjects.globalenv['df'] = df_r
        rpy2.robjects.globalenv['p'] = p_r
        rpy2.robjects.globalenv['w'] = w_r
        vscr_r = rpy2.robjects.r(
            'apply(df, c(3), function(x) vs_sample(x[,1], x[,-1], w, p))')
        return np.array(vscr_r)
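# Illustrative sketch (assumed, not from the original source): toy random data
# for the vectorized variogram score. It presumes the module's imports
# (numpy as np, rpy2, np2ri) and that the R function vs_sample -- e.g. from the
# scoringRules package -- is attached in the embedded R session.
d, m, n = 3, 50, 20                        # dimension, ensemble size, number of cases
toy_obs = np.random.randn(d, n)            # one d-dimensional observation per column
toy_ens = np.random.randn(d, m, n)         # m-member forecast ensemble per case

vscores = vs_sample_vec(toy_obs, toy_ens, p=0.5)   # one variogram score per case
print(vscores.shape)                               # expected: (20,)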
def _roc_curve_r(observations, predictions, FDRth=0.05):
    """Compute a ROC curve via the R pROC package.

    :param observations: known truth set (class labels)
    :param predictions: all data (prediction scores)
    :param FDRth: false discovery rate threshold (default 0.05)
    :return: (df, auc, SENS, FDR5percTh) -- the ROC coordinates as a
        DataFrame, the AUC, the sensitivity at the FDR threshold, and the
        threshold value
    """
    obs_rtbl = numpy2ri.py2ri(observations)
    prd_rtbl = numpy2ri.py2ri(predictions)
    roc_prm = {'direction': '>'}
    RES = pROC.roc(obs_rtbl, prd_rtbl, **roc_prm)
    auc = pandas2ri.ri2py(RES.rx2('auc'))[0]
    columns = ['threshold', 'ppv', 'sensitivity', 'specificity']
    coor_prm = {'ret': r.c('threshold', 'ppv', 'sensitivity', 'specificity')}
    COORS = pROC.coords(RES, 'all', **coor_prm)
    cords = numpy2ri.ri2py(COORS)
    df = pd.DataFrame(cords.T, columns=columns)
    # largest score threshold at which the positive predictive value still meets the FDR bound
    FDR5percTh = (df[df.ppv >= (1 - FDRth)])['threshold'].max()
    if not np.isnan(FDR5percTh):
        index_min = min(df[df.threshold <= FDR5percTh].index.tolist())
    else:
        index_min = 0
    threshold = df.at[index_min, 'threshold']
    SENS = df.at[index_min, 'sensitivity']
    SPEC = df.at[index_min, 'specificity']
    return df, auc, SENS, FDR5percTh
def pd_py2ri(o):
    """Convert a pandas / numpy object to its rpy2 (R) equivalent."""
    res = None
    if isinstance(o, pd.Series):
        # treat a Series as a single-column DataFrame
        o = pd.DataFrame(o, index=o.index)

    if isinstance(o, pd.DataFrame):
        if isinstance(o.index, pd.DatetimeIndex):
            res = rconv.convert_df_to_xts(o)
        else:
            res = rcom.convert_to_r_dataframe(o)

    if isinstance(o, pd.DatetimeIndex):
        res = rconv.convert_datetime_index(o)

    if isinstance(o, pd.Timestamp):
        res = rconv.convert_timestamp(o)

    if res is None:
        try:
            res = numpy2ri.py2ri(o)
        except Exception:
            res = robjects.default_converter.py2ri(o)
    return res
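# Hedged usage sketch (added for illustration): the two main branches of the
# converter above. `rconv` and `rcom` are the module's own helpers referenced
# in the function body and must be importable for the DatetimeIndex branch.
plain_df = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]})
r_dataframe = pd_py2ri(plain_df)           # plain DataFrame -> R data.frame

dated_series = pd.Series([1.0, 2.0],
                         index=pd.date_range('2020-01-01', periods=2))
r_xts = pd_py2ri(dated_series)             # DatetimeIndex -> xts time series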
def _infer_network(self, data):
    """
    Infer the network.

    Args:
        data (pd.DataFrame): data to be used for the inference.
    """
    # activate implicit conversion from pandas to R objects
    pandas2ri.activate()
    genie3 = importr('GENIE3')
    importr('foreach')
    importr('doParallel')
    # transform pandas dataframe into GENIE3 input format
    globalenv['r_matrix'] = numpy2ri.py2ri(data.T.values)
    globalenv['r_rows'] = data.columns
    globalenv['r_cols'] = data.index
    r('''
    rownames(r_matrix) <- c(r_rows)
    colnames(r_matrix) <- c(r_cols)
    ''')
    expr_matrix = globalenv['r_matrix']
    # run GENIE3
    values = numpy2ri.ri2py(
        genie3.GENIE3(expr_matrix, self.regulators, self.targets,
                      self.tree_method, self.k, self.n_trees,
                      self.n_cores, self.verbose))
    weight_matrix = pd.DataFrame(values,
                                 columns=data.columns,
                                 index=data.columns)
    self.graph = Graph(adjacency=weight_matrix)
    logger.debug('inferred with {}'.format(self.method))
def es_sample(y, dat):
    """Sample Energy Score

    Compute the energy score ES(*y*, *dat*), where *y* is a vector of a
    *d*-dimensional observation and *dat* is a multivariate ensemble forecast.

    For details, see Gneiting, T., Stanberry, L.I., Grimit, E.P., Held, L. and
    Johnson, N.A. (2008). Assessing probabilistic forecasts of multivariate
    quantities, with an application to ensemble predictions of surface winds.
    Test, 17, 211-235.

    Args:
        *y* (np.array): Realized values (numeric vector of length *d*).
        *dat* (np.array): Forecast sample of shape (*d*, *m*), where *d* is
            the dimension of the realization and *m* the number of sample
            members. Each of the *m* columns corresponds to the
            *d*-dimensional forecast of one ensemble member.

    Returns:
        float: Energy score of the forecast-observation pair.
    """
    try:
        y = np.array(y)
        dat = np.array(dat)
        y_r = rpy2.robjects.FloatVector(y)
        dat_r = np2ri.py2ri(dat)
    except Exception:
        print('Input has wrong format.')
        return None  # avoid a NameError on malformed input
    return srl.es_sample(y_r, dat_r)[0]
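# Toy example (illustrative only, not part of the original source): assumes
# numpy as np and that `srl` is the module's rpy2 handle to the R package
# providing es_sample (scoringRules), as used in the function body.
toy_obs = np.array([0.1, -0.3, 1.2])       # one 3-dimensional observation
toy_ens = np.random.randn(3, 100)          # 100-member ensemble forecast

print(es_sample(toy_obs, toy_ens))         # a single float: the energy score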
def vs_sample(y, dat, w=None, p=0.5):
    """Sample Variogram Score

    Compute the variogram score VS(*y*, *dat*) of order *p*, where *y* is a
    *d*-dimensional observation and *dat* is a multivariate ensemble forecast.

    For details, see Scheuerer, M. and Hamill, T.M. (2015). Variogram-based
    proper scoring rules for probabilistic forecasts of multivariate
    quantities. Monthly Weather Review, 143, 1321-1334.

    Args:
        *y* (np.array): Observation (numeric vector of length *d*).
        *dat* (np.array): Forecast sample of shape (*d*, *m*), where *d* is
            the dimension of the realization and *m* the number of sample
            members.
        *p* (float): Order of variogram score. Standard choices include
            *p* = 1 and *p* = 0.5 (default).
        *w* (np.array): Numeric array of weights for *dat* used in the
            variogram score. If no weights are specified, constant weights
            with *w* = 1 are used.

    Returns:
        float: Variogram score of the forecast-observation pair.
    """
    try:
        y = np.array(y)
        dat = np.array(dat)
        if w is None:
            w_r = rpy2.robjects.NULL
        else:
            w = np.array(w)
            w_r = np2ri.py2ri(w)
        p_r = float(p)
        y_r = rpy2.robjects.FloatVector(y)
        dat_r = np2ri.py2ri(dat)
    except Exception:
        print('Input has wrong format.')
        return None  # avoid a NameError on malformed input
    return srl.vs_sample(y=y_r, dat=dat_r, w=w_r, p=p_r)[0]
def es_sample_vec(y_arr, dat_arr):
    """Sample Energy Score; vectorized version

    Compute the energy score ES(*y_arr*, *dat_arr*), where *y_arr* is a series
    of *d*-dimensional observations and *dat_arr* is a series of samples of
    multivariate forecasts.

    For details, see Gneiting, T., Stanberry, L.I., Grimit, E.P., Held, L. and
    Johnson, N.A. (2008). Assessing probabilistic forecasts of multivariate
    quantities, with an application to ensemble predictions of surface winds.
    Test, 17, 211-235.

    Args:
        *y_arr* (np.array): Series of observations of shape (*d*, *n*), where
            *d* is the dimension of the observations and *n* the number of
            observations. Hence each column contains a single *d*-dimensional
            realization.
        *dat_arr* (np.array): Forecast sample of shape (*d*, *m*, *n*), where
            *d* is the dimension of the realized values, *m* the number of
            samples, and *n* the number of realizations.

    Returns:
        np.array: Energy score of each forecast-observation pair.
    """
    try:
        y_arr = np.array(y_arr)
        y_arr = np.expand_dims(y_arr, 1)
        dat_arr = np.array(dat_arr)
    except Exception:
        print('Input has wrong format.')
    else:
        if (len(y_arr.shape) != 3 or len(dat_arr.shape) != 3
                or y_arr.shape[0] != dat_arr.shape[0]
                or y_arr.shape[2] != dat_arr.shape[2]):
            raise ValueError('Parameters have wrong dimension.')
        # stack observation and ensemble along the member axis and score in R
        df = np.concatenate((y_arr, dat_arr), axis=1)
        df_r = np2ri.py2ri(df)
        rpy2.robjects.globalenv['df'] = df_r
        escr_r = rpy2.robjects.r(
            'apply(df, c(3), function(x) es_sample(x[,1], x[,-1]))')
        return np.array(escr_r)
def crps_sample_vec(y_arr, dat_arr):
    """Sample Continuous Ranked Probability Score (CRPS); vectorized version

    Compute CRPS(*y_arr*, *dat_arr*), where *y_arr* is a series of univariate
    observations and *dat_arr* is a series of ensemble forecasts.

    For details, see Matheson, J.E. and Winkler, R.L. (1976). Scoring rules
    for continuous probability distributions. Management Science, 22,
    1087-1096.

    Args:
        *y_arr* (np.array): Series of observations of length *n*, where *n*
            is the number of observations.
        *dat_arr* (np.array): Ensemble forecasts of shape (*m*, *n*), where
            *m* is the number of ensemble members and *n* the number of
            observations.

    Returns:
        np.array: CRPS of each forecast-observation pair.
    """
    try:
        y_arr = np.array(y_arr)
        dat_arr = np.array(dat_arr)
        y_r = rpy2.robjects.FloatVector(y_arr)
        dat_r = np2ri.py2ri(dat_arr)
    except Exception:
        print('Input has wrong format.')
    else:
        if (len(y_arr.shape) != 1 or len(dat_arr.shape) != 2
                or y_arr.shape[0] != dat_arr.shape[1]):
            raise ValueError('Parameters have wrong dimension.')
        rpy2.robjects.globalenv['obs'] = y_r
        rpy2.robjects.globalenv['forc'] = dat_r
        crps_r = rpy2.robjects.r(
            'apply(rbind(obs,forc), 2, function(x) crps_sample(x[1], x[-1]))')
        return np.array(crps_r)
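# Illustrative sketch (assumed): toy data for the vectorized CRPS. Requires
# numpy as np plus the module's rpy2 setup, with crps_sample (scoringRules)
# available in the embedded R session.
n_cases, n_members = 30, 50
toy_obs = np.random.randn(n_cases)                 # one scalar observation per case
toy_ens = np.random.randn(n_members, n_cases)      # one ensemble per column

crps_vals = crps_sample_vec(toy_obs, toy_ens)      # array of length n_cases
print(crps_vals.mean())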
def Rdeepnet(train, train_class, test, hidden_N, nepoch):
    import rpy2.robjects as robjects
    robjects.r('''
    dp <- function(hidden_N, nepoch, train, train_class, test)
    {
        a <- Sys.time()
        print(hidden_N)
        labelNames <- c("Shopping", "Food")
        predictions <- matrix(0, nrow = nrow(test), ncol = length(labelNames))
        predictions <- data.frame(predictions)
        # colnames(predictions) <- labelNames
        set.seed(1)
        library(deepnet)
        nn <- nn.train(as.matrix(train), as.matrix(train_class),
                       hidden = hidden_N, numepochs = nepoch)
        predictions <- nn.predict(nn, test)
        predictions <- round(predictions)
        b <- Sys.time()
        print(b - a)
        return(predictions)
    }
    ''')
    r_f = robjects.r['dp']

    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    # convert pandas DataFrames to R data frames
    train = pandas2ri.py2ri(train)
    test = pandas2ri.py2ri(test)
    train_class = pandas2ri.py2ri(train_class)

    from rpy2.robjects import numpy2ri
    numpy2ri.activate()
    # convert the Python sequence of hidden layer sizes to an R vector
    hidden_N = numpy2ri.py2ri(np.array(hidden_N))

    predictions = pandas2ri.ri2py_dataframe(
        r_f(hidden_N, nepoch, train, train_class, test))
    return predictions
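# Hypothetical call sketch (the data, sizes, and label layout are made up):
# the R 'deepnet' package must be installed for nn.train/nn.predict inside
# Rdeepnet to work, and numpy/pandas are assumed imported as np/pd.
toy_train = pd.DataFrame(np.random.rand(100, 5))                    # training features
toy_labels = pd.DataFrame(np.random.randint(0, 2, size=(100, 2)))   # two-column label matrix
toy_test = pd.DataFrame(np.random.rand(20, 5))                      # test features

toy_preds = Rdeepnet(toy_train, toy_labels, toy_test,
                     hidden_N=[10, 5], nepoch=50)
print(toy_preds.shape)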
        remained = x[-top[i, :] > -1]
        for j in range(len(remained)):
            combination = np.zeros(bioN)
            tmpselected = np.append(selected, remained[j])
            combination[tmpselected - 1] = 1
            comb = np.concatenate(([combination], comb))
    return comb


## implement iteration
thresholdN = 10
iterationT = 2
while iterationT < thresholdN:
    iterationT = iterationT + 1
    comb = prepareCombination(topres, bioN)
    rcomb = numpy2ri.py2ri(comb)
    rcomb = robjects.Matrix(rcomb)
    robjects.globalenv['rcomb'] = rcomb
    rscript_calC = '''
    rcomb <- data.frame(rcomb)
    starttimeC <- Sys.time()
    resC <- func_bycb(rcomb)
    endtimeC <- Sys.time()
    ctimeC <- endtimeC - starttimeC
    '''
    robjects.r(rscript_calC)
    # print(robjects.r['head']('rcomb'))
    npresC = np.array(robjects.r['resC'])
    npresC = np.reshape(npresC, newshape=(npresC.shape[0], npresC.shape[1]))
    npresC = np.transpose(npresC)
    npres = np.concatenate((npres, npresC), axis=0)