def anova(self, design, formula, heteroscedasticity_threshold=0.05):
    """Fit an ANOVA model for ``formula`` and collect one value per model term.

    The data is ``self.prune_data(self.complete_design_data)``. When
    ``self.test_heteroscedasticity`` returns a truthy value the model is
    taken from ``self.transform_lm``; otherwise, or when that call returned
    ``None``, it is fitted with ``self.stats.aov``.

    Args:
        design: Not used by this method; kept for call compatibility.
        formula: Model formula, passed to ``Formula`` and to the helpers.
        heteroscedasticity_threshold: Forwarded to
            ``self.test_heteroscedasticity``.

    Returns:
        ``(regression, prf_values)``. ``prf_values`` maps each stripped row
        name of the first summary table, except ``"Residuals"``, to the
        matching entry of that table's fifth column (presumably Pr(>F) --
        TODO confirm). Both are ``None`` when any step raised.
    """
    # Checking for errors in R
    # TODO: Deal better with this, catch actual exceptions
    try:
        info("Anova Formula in Python: " + str(formula))
        info("Anova Formula in R: " + str(Formula(formula)))

        aov_data = self.prune_data(self.complete_design_data)

        if self.test_heteroscedasticity(aov_data, formula,
                                        heteroscedasticity_threshold):
            regression = self.transform_lm(aov_data, formula)
        else:
            regression = self.stats.aov(Formula(formula), aov_data)

        # transform_lm may give up and return None: fall back to a plain aov.
        if regression is None:
            regression = self.stats.aov(Formula(formula), aov_data)

        summary_regression = self.stats.summary_aov(regression)
        info("Regression Step:" + str(summary_regression))

        prf_values = {}
        for k, v in zip(self.base.rownames(summary_regression[0]),
                        summary_regression[0][4]):
            if k.strip() != "Residuals":
                prf_values[k.strip()] = v
    except Exception:
        # Exception (not a bare except) so Ctrl-C and SystemExit still
        # propagate instead of being reported as a failed regression.
        info("Regression Step Failed!")
        regression = None
        prf_values = None

    return regression, prf_values
def build_drf_model(self, x_old, y):
    """Estimate a dose-response model with ``self.gps.hi_est``.

    Args:
        x_old: 2-D array whose last column is the treatment value and whose
            remaining columns are covariates.
        y: Outcomes; reshaped to a single column before being appended.

    Returns:
        ``(distribution, model)``: a ``norm`` parameterised by the mean and
        standard deviation of the treatment model's fitted values, and the
        third element of the ``hi_est`` result.
    """
    from rpy2.robjects.vectors import StrVector, FactorVector, FloatVector, IntVector
    from rpy2.robjects import Formula, pandas2ri

    covariates = x_old[:, :-1]
    treatment = x_old[:, -1]
    stacked = np.concatenate(
        [covariates,
         np.reshape(treatment, (-1, 1)),
         np.reshape(y, (-1, 1))],
        axis=-1)

    # Covariate columns are named 0..k-1, followed by "T" and "Y".
    covariate_count = stacked.shape[-1] - 2
    column_names = np.arange(0, covariate_count).tolist() + ["T", "Y"]
    data_frame = pandas2ri.py2ri(
        Baseline.to_data_frame(stacked, column_names=column_names))

    treat_formula = Formula('T ~ ' + '+'.join(data_frame.names[:-2]))
    outcome_formula = Formula('Y ~ T + I(T^2) + gps + T * gps')
    grid = FloatVector([float(tt) for tt in np.linspace(0, 1, 256)])

    result = self.gps.hi_est(
        Y="Y",
        treat="T",
        treat_formula=treat_formula,
        outcome_formula=outcome_formula,
        data=data_frame,
        grid_val=grid,
        treat_mod="Normal",
        link_function="log"
    )  # link_function is not used with treat_mod = "Normal".

    treatment_model = result[1]
    model = result[2]
    fitted_values = treatment_model.rx2('fitted.values')
    distribution = norm(np.mean(fitted_values), np.std(fitted_values))
    return distribution, model
def fetch_stats_totals(des, qn_f, r):
    """Compute the top-level (level 0) statistics row for one response.

    Args:
        des: Survey design object handed to the R survey functions.
        qn_f: Formula string for the question.
        r: Response label; copied into the ``response`` column and the logs.

    Returns:
        The one-row ``pandas.DataFrame`` of statistics, rounded to
        ``DECIMALS`` and passed through ``u.fill_none``.
    """
    total_ci = svyciprop_xlogit(Formula(qn_f), des, multicore=False)

    # extract stats
    logger.info('fetching stats totals', r=r, q=qn_f)
    r_counts = rsvy.svyby(Formula(qn_f), Formula(qn_f), des,
                          rsvy.unwtd_count, na_rm=True, na_rm_by=True,
                          na_rm_all=True, multicore=False)
    counts = pandas2ri.ri2py(r_counts)
    counts.columns = ['eql', 'ct', 'se_ignore']
    ct = counts.ct[counts.eql == 1].sum()
    ss = counts.ct.sum()

    # The estimate fields are only filled in when total_ci is truthy.
    if total_ci:
        mean = u.guard_nan(rbase.as_numeric(total_ci)[0])
        se = u.guard_nan(rsvy.SE(total_ci)[0])
        ci_l = u.guard_nan(rbase.attr(total_ci, 'ci')[0])
        ci_u = u.guard_nan(rbase.attr(total_ci, 'ci')[1])
    else:
        mean = se = ci_l = ci_u = None

    res = {
        'level': 0,
        'response': r,
        'mean': mean,
        'se': se,
        'ci_l': ci_l,
        'ci_u': ci_u,
        'count': ct,
        'sample_size': ss
    }

    # round as appropriate
    logger.info('finished computation lvl1', res=res,
                total_ci=total_ci, ct=ct, ss=ss)
    res = pd.DataFrame([res]).round(DECIMALS)
    return u.fill_none(res)
def opt_federov(self, design_formula, trials, data, max_iterations=1000000, nullify=0):
    """Run ``optFederov`` on a coded copy of ``data``.

    One encoding formula is built for every key of ``self.parameter_ranges``;
    the divisor is ``(range[1] - 1.0) / 2.0``. The data is coded with
    ``self.rsm.coded_data`` before the design search.

    Args:
        design_formula: Formula string for the design model.
        trials: Passed to ``optFederov`` as ``nTrials``.
        data: R data object describing the search space.
        max_iterations: Passed as ``maxIteration``.
        nullify: Passed as ``nullify``.

    Returns:
        Whatever ``self.algdesign.optFederov`` returns.
    """
    info("Starting \"optFederov\" run")
    info("Using Search Space:")
    info(str(self.utils.str(data)))

    formulas = {}
    for parameter, bounds in self.parameter_ranges.items():
        centre = (bounds[1] - 1.0) / 2.0
        encoded_name = "{0}e".format(parameter)
        formulas[encoded_name] = Formula(
            "{0}e ~ ({0} - {1}) / {1}".format(parameter, centre))

    info("Encoding formulas: " + str(self.utils.str(ListVector(formulas))))
    info("Data Dimensions: " + str(self.base.dim(data)))

    coded_data = self.rsm.coded_data(data, formulas=ListVector(formulas))
    info("Coded data: " + str(self.utils.str(coded_data)))

    return self.algdesign.optFederov(frml=Formula(design_formula),
                                     data=coded_data,
                                     nTrials=trials,
                                     nullify=nullify,
                                     nRepeats=10,
                                     maxIteration=max_iterations)
def __init__(self, formula_str, df, factors=None, resid_formula_str=None,
             **lmer_opts):
    """Prepare the R data frame and formula objects for a later model fit.

    Args:
        formula_str: Model formula; the text before ``~`` names the
            predicted variable.
        df: ``pandas.DataFrame`` (converted with ``to_records``) or a record
            array. A zero-filled column is appended when the predicted
            variable is not already one of its fields.
        factors: Optional mapping of column name to factor levels. A value
            of ``None`` is replaced by ``MissingArg``; other non-``Vector``
            values are wrapped in ``Vector``. The mapping is copied first,
            so the caller's dict is never modified.
        resid_formula_str: Optional second formula, stored as
            ``self._rformula_resid`` (``None`` when omitted or empty).
        **lmer_opts: Stored unchanged on ``self._lmer_opts``.
    """
    # get the pred_var
    pred_var = formula_str.split('~')[0].strip()

    # convert df to a recarray if it's a dataframe
    if isinstance(df, pd.DataFrame):
        df = df.to_records()

    # add column if necessary
    if pred_var not in df.dtype.names:
        # must add it
        df = append_fields(df, pred_var, [0.0] * len(df), usemask=False)

    # Work on a private copy: the loops below insert and replace entries,
    # which previously leaked into the dict owned by the caller.
    factors = {} if factors is None else dict(factors)

    # add in missingarg for any potential factor not provided
    for k in df.dtype.names:
        if isinstance(df[k][0], str) and k not in factors:
            factors[k] = MissingArg

    for f in factors:
        if factors[f] is None:
            factors[f] = MissingArg
        # checking for both types of R Vectors for rpy2 variations
        elif (not isinstance(factors[f], Vector) and
              not factors[f] == MissingArg):
            factors[f] = Vector(factors[f])

    # convert the recarray to a DataFrame (releveling if desired)
    self._rdf = DataFrame({
        k: (FactorVector(df[k], levels=factors[k])
            if (k in factors) or isinstance(df[k][0], str)
            else df[k])
        for k in df.dtype.names
    })

    # get the column index
    self._col_ind = list(self._rdf.colnames).index(pred_var)

    # make a formula obj
    self._rformula = Formula(formula_str)

    # make one for resid if necessary
    if resid_formula_str:
        self._rformula_resid = Formula(resid_formula_str)
    else:
        self._rformula_resid = None

    # save the args
    self._lmer_opts = lmer_opts

    # model is null to start
    self._ms = None
def _limma(data: pd.DataFrame, design: pd.DataFrame, alpha: float = 0.05,
           adjust_method: str = 'fdr_bh') -> pd.DataFrame:
    """Wrap limma to perform single sample DE analysis.

    Args:
        data: Expression values; the index supplies the gene labels.
        design: Frame with a ``Target`` column whose unique values define
            the groups. The first and last unique value are contrasted.
        alpha: Significance level. Passed to ``multipletests`` and used as
            the adjusted p-value cut-off above which ``logFC`` is zeroed
            (this cut-off used to be hard-coded to 0.05, the default).
        adjust_method: Multiple-testing correction for ``multipletests``.

    Returns:
        The limma ``topTreat`` table as a pandas frame, with ``adj.P.Val``
        recomputed and ``logFC`` set to 0 for rows that are not significant
        or whose absolute ``logFC`` is below 1.3.
    """
    # Import R libraries
    base = importr('base')
    stats = importr('stats')
    try:
        limma = importr('limma')
    except RRuntimeError as e:
        click.echo(e)
        click.echo("Please check if limma package is installed in R. \n If not, follow the instructions from LINK "
                   "HERE.")
        sys.exit(1)

    # Convert data and design pandas dataframes to R dataframes
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_data = ro.conversion.py2rpy(data)
        r_design = ro.conversion.py2rpy(design)

    # Use the genes index column from data as a R String Vector
    genes = ro.StrVector([str(index) for index in data.index.tolist()])

    # Create a model matrix using design's Target column using the R formula
    # "~0 + f" to get all the unique factors as columns
    f = base.factor(r_design.rx2('Target'),
                    levels=base.unique(r_design.rx2('Target')))
    form = Formula('~0 + f')
    form.environment['f'] = f
    r_design = stats.model_matrix(form)
    r_design.colnames = base.levels(f)

    # Fit the data to the design using lmFit from limma
    fit = limma.lmFit(r_data, r_design)

    # Make a contrasts matrix with the 1st and the last unique values
    contrast_matrix = limma.makeContrasts(
        f"{r_design.colnames[0]}-{r_design.colnames[-1]}", levels=r_design)

    # Fit the contrasts matrix to the lmFit data & calculate the bayesian fit
    fit2 = limma.contrasts_fit(fit, contrast_matrix)
    fit2 = limma.eBayes(fit2)

    # topTreat the bayesian fit using the contrasts and add the genelist.
    # np.inf: the np.Inf alias no longer exists in NumPy 2.0.
    r_output = limma.topTreat(fit2, coef=1, genelist=genes, number=np.inf)

    # Convert R dataframe to Pandas
    with localconverter(ro.default_converter + pandas2ri.converter):
        output = ro.conversion.rpy2py(r_output)

    # Adjust P value with the provided adjusted method
    output['adj.P.Val'] = multipletests(output['P.Value'], alpha=alpha,
                                        method=adjust_method)[1]

    # Single .loc[row, column] assignments: the chained
    # output['logFC'].loc[...] = 0 form may write to a temporary copy and
    # does nothing under pandas copy-on-write.
    output.loc[output['adj.P.Val'] > alpha, 'logFC'] = 0
    output.loc[np.abs(output['logFC']) < 1.3, 'logFC'] = 0

    return output
def dirichletreg_df(prop_df, covar_df, formula, onevsrest_category=None,
                    return_reg_input=False):
    """Fit a Dirichlet regression in R and return its coefficient table.

    Args:
        prop_df: Proportions frame; converted with ``DR_data`` and bound to
            ``y`` in the formula environment.
        covar_df: Covariates, concatenated column-wise with ``prop_df``.
        formula: R formula string.
        onevsrest_category: Optional column of ``prop_df``. When given, the
            "alternative" parameterisation is fitted with that column's
            1-based position as ``sub.comp``.
        return_reg_input: When true, also return the concatenated input.

    Returns:
        ``coef_df``, or ``(dr_df, coef_df)`` when ``return_reg_input`` is
        true. ``coef_df`` has the columns ``coefficient``, ``se``, ``zval``,
        ``pval``, ``compartment``, ``variable``, ``significance`` and, for
        one-vs-rest fits, ``coef_type``.
    """
    from rpy2.robjects import r, Formula
    from rpy2.robjects.packages import importr
    from rpy2.rinterface_lib.callbacks import logger as rpy2_logger

    dr = importr('DirichletReg')
    dr_df = pd.concat([prop_df, covar_df], axis=1)

    f = Formula(formula)

    rpy2_logger.setLevel(logging.ERROR)  # will display errors, but not warnings
    try:
        f.environment['y'] = dr.DR_data(py2r(prop_df))
    finally:
        # Re-enable warnings even when the conversion fails, so a failure
        # here cannot leave the rpy2 logger muted for the whole process.
        rpy2_logger.setLevel(logging.WARNING)

    if onevsrest_category is None:
        fit = dr.DirichReg(f, py2r(dr_df))
    else:
        assert onevsrest_category in prop_df.columns
        cat_index = prop_df.columns.tolist().index(onevsrest_category) + 1
        fit = dr.DirichReg(f,
                           py2r(dr_df),
                           model='alternative',
                           **{'sub.comp': cat_index})

    # Silence R's printed summary. The sink is undone in a finally block so
    # an error inside summary() cannot leave R output redirected.
    r.sink(file='/dev/null')
    try:
        u = r.summary(fit)
    finally:
        r.sink()
        if r('sink.number')()[0] > 0:
            r.sink()

    if onevsrest_category is None:
        varnames = u.rx2('varnames')
    else:
        varnames = [onevsrest_category] * 2

    coef_mat = u.rx2('coef.mat')
    rows = r2py(r('rownames')(coef_mat))
    coef_df = r2py(r('as.data.frame')(coef_mat)).reset_index(drop=True)
    coef_df.columns = ['coefficient', 'se', 'zval', 'pval']

    coef_df['compartment'] = np.repeat(varnames, r2py(u.rx2('n.vars')))
    coef_df['variable'] = rows
    coef_df['significance'] = bin_pval(coef_df.pval)

    if onevsrest_category is not None:
        coef_df['coef_type'] = np.repeat(['mean', 'precision'],
                                         r2py(u.rx2('n.vars')))

    if return_reg_input:
        return dr_df, coef_df
    else:
        return coef_df
def des_from_survey_db(tbl, db, host, port, denovo=False, fpc=False, design='cluster'):
    """Create an R ``svydesign`` backed by a ``MonetDB.R`` database table.

    Args:
        tbl: Table handed to ``svydesign`` as ``data``.
        db: Database name.
        host: Database host.
        port: Database port.
        denovo: When true, stratify on ``~yr+sitecode`` instead of
            ``~strata``.
        fpc: When true, pass ``~fpc`` as the finite population correction;
            otherwise pass R ``NULL``.
        design: Not used by this function.

    Returns:
        The object returned by ``rsvy.svydesign``.
    """
    strata_spec = '~yr+sitecode' if denovo else '~strata'
    fpc_arg = Formula('~fpc') if fpc else ro.NULL
    return rsvy.svydesign(
        id=Formula('~psu'),
        weight=Formula('~weight'),
        strata=Formula(strata_spec),
        nest=True,
        fpc=fpc_arg,
        data=tbl,
        dbname=db,
        host=host,
        port=port,
        dbtype='MonetDB.R')
def fit(self, train_data, labels, formula="class ~ .", feature_names=""):
    """Train ``self.r_model`` as a radial-kernel eps-regression.

    Args:
        train_data: Training features; wrapped in a ``pandas.DataFrame``.
        labels: Target values, stored in a new ``class`` column.
        formula: Formula string, or an already-built formula object that is
            used unchanged.
        feature_names: Optional iterable of column names. When non-empty
            and ``formula`` is a string, the formula is rebuilt from these
            names and the ``formula`` text is ignored.

    Returns:
        The trained model, also stored on ``self.trained``.
    """
    # train should be a dataframe and labels a numpy.array
    train_data = pd.DataFrame(train_data)
    train_data["class"] = labels

    with localconverter(ro.default_converter + pandas2ri.converter):
        train_R = ro.conversion.py2rpy(train_data)

    # isinstance also accepts str subclasses, unlike the former
    # type(formula) == type("string") comparison.
    if isinstance(formula, str):
        if feature_names:
            formula = Formula("class ~" + "+ ".join(feature_names))
        else:
            formula = Formula(formula)

    self.trained = self.r_model(formula, data=train_R, scale=True,
                                type="eps-regression", kernel="radial")
    return self.trained
def fit(self, train_data, labels, formula="class ~ .", feature_names="",
        test_data=None, kernel="gaussian"):
    """Store the training data, formula and kernel for later use.

    Nothing is fitted here: the method only records its inputs on the
    instance.

    Args:
        train_data: Training features; wrapped in a ``pandas.DataFrame``.
        labels: Target values, stored in a new ``class`` column.
        formula: Formula string, or an already-built formula object that is
            used unchanged.
        feature_names: Optional iterable of column names. When non-empty
            and ``formula`` is a string, the formula is rebuilt from these
            names and the ``formula`` text is ignored.
        test_data: Not used by this method.
        kernel: Kernel name, stored on ``self.kernel``.

    Returns:
        ``self.r_model``.
    """
    # train should be a dataframe and labels a numpy.array
    train_data = pd.DataFrame(train_data)
    train_data["class"] = labels

    with localconverter(ro.default_converter + pandas2ri.converter):
        self.train_R = ro.conversion.py2rpy(train_data)

    # isinstance also accepts str subclasses, unlike the former
    # type(formula) == type("string") comparison.
    if isinstance(formula, str):
        if feature_names:
            formula = Formula("class ~" + "+ ".join(feature_names))
        else:
            formula = Formula(formula)

    self.formula = formula
    self.kernel = kernel
    return self.r_model
def from_rdf(cls, spss_file, rdf):
    """Alternate constructor: build an instance from an SPSS file and R data.

    Args:
        spss_file: SPSS definition file parsed for column definitions and
            variable annotations.
        rdf: R data frame used as the survey data.

    Returns:
        ``cls(des=..., vars=..., rdf=rdf)``, where ``des`` is a nested
        ``svydesign`` over ``~psu``, ``~weight`` and ``~stratum``.
    """
    logging.info('loading column definitions')
    # Result is not used below; the call is kept as in the original.
    svy_cols = parse_fwfcols_spss(spss_file)

    logging.info('loading variable annotations')
    annotations = parse_surveyvars_spss(spss_file)

    logging.info('creating survey design from data and annotations')
    survey_design = rsvy.svydesign(
        id=Formula('~psu'),
        weight=Formula('~weight'),
        strata=Formula('~stratum'),
        data=rdf,
        nest=True)

    return cls(des=survey_design, vars=annotations, rdf=rdf)
def Xctree(RESPONSE__, datatrain__, datatest__=None, VERBOSE=False, TREE_EXPORT=True):
    """Fit a ``ctree`` on the training data and evaluate and visualise it.

    Args:
        RESPONSE__: Name of the response column; the model is
            ``RESPONSE__ ~ .``.
        datatrain__: Training data.
        datatest__: Optional test data, evaluated with the same tree.
        VERBOSE: Forwarded to ``visTree``.
        TREE_EXPORT: When true and ``visTree`` returned something, the tree
            is exported with ``tree_export``.

    Returns:
        ``(tree, train_pred, train_acc, train_cf, test_pred, test_acc,
        test_cf, vis_tree)``. The three test entries are ``None`` when no
        test data is given.
    """
    model_formula = Formula(RESPONSE__+' ~ .')
    tree = ctree(model_formula, data=datatrain__)
    train_pred, train_acc, train_cf = getresponseframe(
        datatrain__, tree, RESPONSE__, olddata=True)

    if datatest__ is None:
        test_pred = test_acc = test_cf = None
    else:
        test_pred, test_acc, test_cf = getresponseframe(
            datatest__, tree, RESPONSE__)

    vis_tree = visTree(tree, train_pred, PLOT=False, VERBOSE=VERBOSE,
                       ACC=train_acc, ACCx=test_acc, RESP_=RESPONSE__)

    if vis_tree is not None and TREE_EXPORT:
        tree_export(vis_tree, TYPE='polyline', EXEC=True)

    return (tree, train_pred, train_acc, train_cf,
            test_pred, test_acc, test_cf, vis_tree)
def __init__(self, count_matrix, design_matrix, design_formula, gene_column='id'):
    """Store the inputs needed for a DESeq2 run.

    Args:
        count_matrix: Pandas frame of counts including a gene-id column.
        design_matrix: Pandas frame describing the samples.
        design_formula: R formula string for the design.
        gene_column: Name of the gene-id column in ``count_matrix``.

    Raises:
        AssertionError: ``gene_column`` is not a column of ``count_matrix``.
        SystemExit: ``count_matrix`` has no ``columns`` attribute.
    """
    try:
        assert gene_column in count_matrix.columns, 'Wrong gene id column name'
    except AttributeError:
        sys.exit('Wrong Pandas dataframe?')

    # Results that are filled in by later steps.
    self.dds = None
    self.deseq_result = None
    self.comparison = None
    self.normalized_count_matrix = None

    self.gene_column = gene_column
    self.gene_id = count_matrix[self.gene_column]

    count_matrix = count_matrix.drop(gene_column, axis=1)
    sample_count = count_matrix.shape[1]
    design_rows = design_matrix.shape[0]
    print(f'Number of columns in counts data {sample_count} | '
          f'Number of rows in design matrix {design_rows}')

    # Load dataframe into R environment
    # Important: Change to r.data() if you use numpys and rpy2 latests versions
    r_counts = rpy2.robjects.conversion.py2rpy(count_matrix)
    # Assign columns to NULL
    r_counts.names = rpy2.rinterface.NULL

    # NOTE(review): the attribute ends up holding the pandas frame, not the
    # R object converted just above -- confirm whether that is intended.
    self.count_matrix = count_matrix

    self.design_matrix = rpy2.robjects.conversion.py2rpy(design_matrix)
    self.design_formula = Formula(design_formula)
def predict_best_values(self, regression, size, fixed_variables,
                        ordered_prf_keys, prf_values,
                        heteroscedasticity_threshold=0.05):
    """Refit the model on the selected terms and pick the best sample.

    The model is updated to contain only the terms returned by
    ``self.get_ordered_fixed_terms``. A sample of ``size`` points is then
    generated and the point with the smallest prediction is returned.

    Args:
        regression: Fitted R model to update.
        size: Number of points passed to ``self.generate_valid_sample``.
        fixed_variables: Passed to ``self.generate_valid_sample``.
        ordered_prf_keys: Passed to ``self.get_ordered_fixed_terms``.
        prf_values: Passed to ``self.get_ordered_fixed_terms``.
        heteroscedasticity_threshold: Not used by this method.

    Returns:
        The first row of the sampled data whose prediction equals the
        minimum prediction.
    """
    unique_variables = self.get_ordered_fixed_terms(ordered_prf_keys,
                                                    prf_values)
    info("Predicting Best Values for: " + str(unique_variables))

    # No selected terms: keep the model as it is.
    if not unique_variables:
        model = ". ~ ."
    else:
        model = ". ~ " + " + ".join(unique_variables)

    info("Using Model: " + str(model))
    regression = self.stats.update(regression, Formula(model))

    summary_regression = self.stats.summary_aov(regression)
    info("Prediction Regression Step:" + str(summary_regression))

    # TODO only look at the target variables
    data = self.generate_valid_sample(size, fixed_variables)
    predicted = self.stats.predict(regression, data)

    # The minimum is taken on the R side only; the former Python-level
    # min(predicted) result was never used.
    pruned_data = data.rx(predicted.ro == self.base.min(predicted), True)
    return pruned_data.rx(1, True)
def _gam_fit_predict(cls, x, y, weights=None, pred_x=None):
    """Fit ``y ~ s(x)`` with R's ``gam`` package and predict.

    Args:
        x: Predictor values.
        y: Response values.
        weights: Optional per-observation weights; defaults to all ones.
            Only observations with a weight above zero are used.
        pred_x: Points at which to predict; defaults to ``x``.

    Returns:
        ``(y_pred, [deviance, df])`` with the predictions as an array, the
        model deviance, and the first ``df.residual`` entry of the model.
    """
    # Weights
    if weights is None:
        weights = np.repeat(1.0, len(x))

    # Construct dataframe
    keep = np.where(weights > 0)[0]
    training = pd.DataFrame(np.array([x, y]).T[keep, :], columns=['x', 'y'])
    r_training = pandas2ri.py2ri(training)

    # Fit the model
    gam_pkg = importr('gam')
    fitted = gam_pkg.gam(Formula('y~s(x)'), data=r_training,
                         weights=pd.Series(weights[keep]))

    # Predictions
    if pred_x is None:
        pred_x = x
    r_new = pandas2ri.py2ri(pd.DataFrame(pred_x, columns=['x']))
    y_pred = np.array(robjects.r.predict(fitted, newdata=r_new))

    deviance = np.array(robjects.r.deviance(fitted))
    components = dict(zip(fitted.names, list(fitted)))
    resid_df = components['df.residual'][0]

    return y_pred, [deviance, resid_df]
def my_evaluate(individual):
    """Score one candidate labelling against four reference measures.

    ``individual`` is written to the ``label`` column of the module-level
    ``dataFrame``, which is also published to R's global environment.

    Returns:
        A 4-tuple of absolute differences from the global reference values,
        in the order balance (C2), linearity (L2), neighborhood N1,
        neighborhood N2.
    """
    dataFrame['label'] = individual
    robjects.globalenv['dataFrame'] = dataFrame
    fmla = Formula('label ~ .')

    def first_value(measure_fn, code):
        # Run one measure and pull out its first reported value.
        vector = measure_fn(fmla, dataFrame, measures=code, summary="return")
        return vector.rx(1)[0][0]

    ## -- linearity
    linearity_gap = abs(
        globalLinear - first_value(stringr_c.linearity_formula, "L2"))

    ## -- neighborhood N1
    n1_gap = abs(globalN1 - first_value(stringr_c.neighborhood_formula, "N1"))

    ## -- neighborhood N2
    n2_gap = abs(globalN2 - first_value(stringr_c.neighborhood_formula, "N2"))

    ## imbalance
    balance_gap = abs(
        globalBalance - first_value(stringr_c.balance_formula, "C2"))

    return balance_gap, linearity_gap, n1_gap, n2_gap
def __init__(self, df, design):
    """Set up the object and run the whole DESeq2 workflow.

    :param df: A data frame formed by merging files, or a list of files.
    :param design: Number of samples in the first treatment group.
                        treatment
            sampleA1        A
            sampleA2        A
            sampleB1        B
            sampleB2        B
    :raises TypeError: ``df`` is neither a DataFrame nor a list.
    """
    # isinstance also accepts DataFrame and list subclasses, which the
    # former exact type() comparison rejected.
    if isinstance(df, pd.DataFrame):
        self.df = df.copy()
    elif isinstance(df, list):
        self.df = Df(df, 'Count').df
    else:
        # Previously self.df was silently left unset for any other type.
        raise TypeError(
            'df must be a pandas DataFrame or a list of files, got '
            + type(df).__name__)

    self.design = design
    self.design_formula = Formula('~ treatment')

    # Filled in by the pipeline steps below.
    self.design_matrix = None
    self.dds = None
    self.normalized_count_matrix = None
    self.result = None

    self.design_design_matrix()
    self.run_deseq2()
    self.get_result()
def generate_slices(self, qn, response, vars=None, filt=None):
    """List every slice (breakout combination) to compute for a question.

    Args:
        qn: Question identifier, copied into every result as ``'q'``.
        response: Response identifier, copied into every result as ``'r'``.
        vars: Breakout variable names. ``None`` or empty means no
            breakouts.
        filt: Outer filter, handed to ``u.fmla_for_filt``. ``None`` or empty
            means no filtering.

    Returns:
        A list of dicts with the keys ``'q'``, ``'r'``, ``'f'`` (the filter)
        and ``'s'`` (the slice selector). The first entry always has an
        empty selector; the rest come from the non-empty cross-tab cells.
    """
    # Fresh containers per call: the former mutable defaults ([] and {})
    # were shared between calls, and the default filter dict was handed
    # back to callers inside every result.
    vars = [] if vars is None else vars
    filt = {} if filt is None else filt

    # create the overall filter
    filt_fmla = u.fmla_for_filt(filt)
    # subset the rdf as necessary
    subs = subset_des_wexpr(self.rdf, filt_fmla) if len(filt) > 0 else self.rdf
    # create a formula for generating the cross-tabs/breakouts across
    # the selected vars
    lvl_f = Formula('~%s' % ' + '.join(vars)) if len(vars) > 0 else None
    # generate the crosstab/breakouts for the selected vars,
    # turn them into R selector expressions and concatenate
    # each non-empty selector with the R selector for the outer filter
    calls = thread_first(
        rstats.xtabs(lvl_f, subs),
        rbase.as_data_frame,
        pandas2ri.ri2py,
        (pd.DataFrame.query, "Freq > 0"),
        (pd.DataFrame.get, vars),
        lambda df: df.apply(lambda z: thread_last(
            z.to_dict(),
            lambda y: [(v, y[v]) for v in vars],
            list,
            lambda x: [tuple(x[:i + 1]) for i in range(len(x))],
        ), axis=1),
        (pd.DataFrame.to_records, False),
        list,
        concat,
        set,
        map(dict),
        list) if len(vars) > 0 else []
    # setup the formula based on the qn and response
    # add the base case with empty slice filter
    # and dicts of qn/resp fmla, slice selector fmla, filt fmla
    res = [{
        'q': qn,
        'r': response,
        'f': filt,
        's': s
    } for s in [{}, *calls]]
    return res
def gam_predict(location_csv, prediction_file, num_arrived, k_value):
    """Fit a binomial GAM on the first rows and predict for all rows.

    Args:
        location_csv: Header-less CSV with the two location columns.
        prediction_file: Header-less CSV with the ``shelter`` column.
        num_arrived: Number of leading rows used for fitting.
        k_value: Basis dimension ``k`` of the smooth ``s(x, y, k=...)``.

    Returns:
        A list with the ``type="response"`` prediction for every row of
        the input files.
    """
    # Import unconditionally. A function-level import makes Formula and
    # importr local names, so skipping it (as the former
    # "if not rpy2_loaded:" guard did) left them unbound and raised
    # UnboundLocalError. Repeated imports are cheap: modules are cached.
    from rpy2.robjects import Formula
    from rpy2.robjects.packages import importr

    base = importr('base')
    utils = importr('utils')
    mgcv = importr('mgcv')

    location_filename = location_csv  # os.path.basename(location_csv)
    prediction_filename = prediction_file  # os.path.basename(prediction_file)

    # Setup
    # base.setwd(os.path.dirname(location_csv))
    loc = utils.read_csv(location_filename, header=False, nrows=num_arrived)
    pred = utils.read_csv(prediction_filename, header=False, nrows=num_arrived)
    pop = base.cbind(pred, loc)
    pop.colnames = ["shelter", "x", "y"]

    # GAM
    formula = Formula('shelter~s(x,y,k={})'.format(k_value))
    m = mgcv.gam(formula, family="binomial", method="REML", data=pop)

    # Predict for everyone
    loc = utils.read_csv(location_filename, header=False)
    pred = utils.read_csv(prediction_filename, header=False)
    newd = base.cbind(pred, loc)
    newd.colnames = ["shelter", "x", "y"]
    result = mgcv.predict_gam(m, newd, type="response", se_fit=False)

    return list(result)
def ddx(self, contrasts=None, formula=None):
    """Run differential expression on the transcriptome counts.

    Args:
        contrasts: Metadata columns used for the design matrix. Defaults to
            ``self.contrasts``.
        formula: R design formula string. Defaults to ``~`` followed by the
            selected contrasts joined with ``+``.

    Returns:
        The ``DE_Genes`` result as a pandas frame indexed by ``gene``.
    """
    if contrasts is None:
        contrasts = self.contrasts

    if formula is None:
        # Build the default from the contrasts actually in use. It used to
        # read self.contrasts, which named columns missing from the design
        # matrix whenever a different ``contrasts`` argument was passed.
        formula = "~" + "+".join(contrasts)

    df = self.data["Transcriptome Profiling"]['counts'].astype(int)
    design = self.metadata[contrasts].reindex(df.columns).reset_index()
    formula = Formula(formula)

    DEG = pandas2ri.ri2py_dataframe(
        DE_Genes(counts_df=pandas2ri.py2ri(df),
                 design_matrix=pandas2ri.py2ri(design),
                 design_formula=formula)).set_index("gene")

    # # Characteristic Direction (Multivariate statistical method)
    # # 0 excluded, 1 is control, 2 is perturbation
    # classes = self.metadata[contrasts]
    # # Calculate differential expression / methylation
    # sig_features = geode.chdir(data = self.dataframe.values,
    #                            sampleclass = classes,
    #                            genes = self.dataframe.index,
    #                            gamma = 1., # smooths covariance and reduces noise
    #                            sort = True,
    #                            calculate_sig = True,
    #                            nnull = 100,
    #                            sig_only = True,
    #                            norm_vector = False)

    return DEG  # , pd.DataFrame(sig_features)
def __init__(self, count_matrix, design_matrix, design_formula, gene_column='id'):
    """Convert the inputs to R objects and create the DESeq2 data set.

    Args:
        count_matrix: Pandas frame of counts including a gene-id column.
        design_matrix: Pandas frame describing the samples.
        design_formula: R formula string for the design.
        gene_column: Name of the gene-id column in ``count_matrix``.

    Raises:
        AssertionError: ``gene_column`` is not a column of ``count_matrix``.
        SystemExit: ``count_matrix`` has no ``columns`` attribute.
    """
    try:
        assert gene_column in count_matrix.columns, 'Wrong gene id column name'
        gene_ids = count_matrix[gene_column]
    except AttributeError:
        sys.exit('Wrong Pandas dataframe?')

    # Results that are filled in by later steps.
    self.dds = None
    self.result = None
    self.deseq_result = None
    self.resLFC = None
    self.comparison = None
    self.normalized_count_df = None

    self.gene_column = gene_column
    self.gene_id = gene_ids
    is_sample = count_matrix.columns != self.gene_column
    self.samplenames = count_matrix.columns[is_sample]

    # The gene ids become the row index of the R count table.
    indexed_counts = count_matrix.set_index(self.gene_column)
    with localconverter(robjects.default_converter + pandas2ri.converter):
        self.count_matrix = robjects.conversion.py2rpy(indexed_counts)
        self.design_matrix = robjects.conversion.py2rpy(design_matrix)

    self.design_formula = Formula(design_formula)
    self.dds = deseq.DESeqDataSetFromMatrix(
        countData=self.count_matrix,
        colData=self.design_matrix,
        design=self.design_formula)
def deseq2_basic(data_frame, numerator=2, denominator=1, category_field='Category', sample_field='Sample', batch_field=None, expression_name_field='Name', counts_field='NumReads'):
    """Run a basic DESeq2 analysis on a long-format data frame.

    Args:
        data_frame: Long-format frame with one row per (feature, sample).
        numerator: Passed to ``_trans`` together with each category value.
        denominator: Passed to ``_trans`` together with each category value.
        category_field: Column holding the condition of each sample.
        sample_field: Column holding the sample name.
        batch_field: Optional batch column; added to the design when given.
        expression_name_field: Column holding the feature name.
        counts_field: Column holding the counts.

    Returns:
        ``(dds0, dds1, res, mat, meta)``: the data set before and after
        ``DESeq``, the results as a pandas frame indexed by feature name,
        the pivoted count matrix, and the per-sample metadata.
    """
    # from a dataframe
    # https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2
    with_batch = batch_field is not None

    # Design terms are backtick-quoted; the batch term comes first.
    terms = [batch_field, category_field] if with_batch else [category_field]
    design = Formula('~ ' + ' + '.join('`' + term + '`' for term in terms))

    mat = data_frame.pivot(columns=sample_field,
                           index=expression_name_field,
                           values=counts_field)

    mfields = [sample_field, category_field]
    if with_batch:
        mfields += [batch_field]
    meta = data_frame[mfields].groupby(sample_field).first().loc[mat.columns]

    categories = meta[category_field].apply(
        lambda x: _trans(x, numerator, denominator))
    metaarr = {category_field: robjects.IntVector(categories)}
    if with_batch:
        metaarr[batch_field] = robjects.IntVector(meta[batch_field])

    dds0 = deseq.DESeqDataSetFromMatrix(countData=mat.astype(int),
                                        colData=robjects.DataFrame(metaarr),
                                        design=design)
    dds1 = deseq.DESeq(dds0)

    res = rpy2.robjects.pandas2ri.ri2py(as_df(deseq.results(dds1)))
    res.index = mat.index
    res.index.name = expression_name_field

    return (dds0, dds1, res, mat, meta)
def fit_glmgp(y, coldata, design="~ log10_umi"):
    """Fit a ``glmGamPoi`` model and return three summary parameters.

    Calling this function activates the pandas and numpy rpy2 converters
    globally.

    Args:
        y: Count data, converted with ``np.asmatrix``.
        coldata: Column data passed as ``col_data``.
        design: R design formula string.

    Returns:
        A dict with the keys ``theta``, ``Intercept`` and ``log10_umi``.
        ``theta`` is the first element of the element-wise minimum of
        ``1 / overdispersions[0]`` and ``mean(Mu, axis=1) / 1e-4``; the
        other two are the first two entries of the first ``Beta`` row.
    """
    import rpy2
    import rpy2.robjects as ro
    import rpy2.robjects.numpy2ri
    from rpy2.robjects import Formula
    from rpy2.robjects import IntVector
    from rpy2.robjects import pandas2ri
    from rpy2.robjects import r
    from rpy2.robjects.packages import importr

    pandas2ri.activate()
    rpy2.robjects.numpy2ri.activate()

    glmgp = importr("glmGamPoi")
    counts = np.asmatrix(y)
    # design_matrix_ro = np.asarray(design_matrix)
    fit = glmgp.glm_gp(data=counts, design=Formula(design),
                       col_data=coldata, size_factors=False)

    def component(name):
        # Look a fit component up by its R name.
        return fit[fit.names.index(name)]

    overdispersions = component("overdispersions")
    mu = component("Mu")
    beta = component("Beta")[0]

    candidates = np.vstack((1 / overdispersions[0],
                            np.mean(mu, axis=1) / 1e-4))
    return {
        "theta": candidates.min(axis=0)[0],
        "Intercept": beta[0],
        "log10_umi": beta[1],
    }
def __init__(self, data=None, name='regression', formula=None,
             var_transform=False, *args, **kwargs):
    """Set up an R-backed regression.

    Args:
        data: Data set; forwarded to the parent class and deep-copied.
        name: Forwarded to the parent class.
        formula: Regression formula string.
        var_transform: When true, the columns of the copied data are renamed
            ``var_0``, ``var_1``, ... and the formula is rewritten to use
            the new names. Intended for column names that are not valid R
            identifiers, such as Chinese names.
        *args: Forwarded to the parent class.
        **kwargs: Forwarded to the parent class.
    """
    super().__init__(data=data, name=name, *args, **kwargs)
    # Create the REnv instance
    self._renv = REnv()
    # Formula to use; rewritten below when variables are renamed
    self._formula = formula
    # Original formula
    self._origin_formula = self._formula
    self._copy_data = deepcopy(data)
    # Rename variables, in particular those with Chinese names
    self._var_transform = var_transform
    if self._var_transform:
        original_names = list(self._copy_data.columns)
        generated_names = ['_'.join(['var', str(i)])
                           for i in range(len(original_names))]
        self._variables_mapping = list(zip(original_names, generated_names))
        self._variables_mapping_dict = dict(self._variables_mapping)
        self._variables_mapping_dict_reversed = dict(
            zip(generated_names, original_names))
        self._generated_variables = [item[1]
                                     for item in self._variables_mapping]
        self._copy_data.columns = self._generated_variables

        mapping = self._variables_mapping_dict
        if mapping:
            # Replace every name in one pass. Names are escaped so regex
            # metacharacters in a column name are matched literally, and
            # longer names are tried first so a name that is a prefix of
            # another cannot win. A single pass also prevents a later name
            # from matching inside an already generated ``var_N``.
            longest_first = sorted(mapping, key=len, reverse=True)
            pattern = '|'.join(re.escape(key) for key in longest_first)
            self._formula = re.sub(
                pattern,
                lambda match: mapping[match.group(0)],
                self._formula)

    self._formula = Formula(self._formula)
    self._lm = importr('stats').lm
    self._summary = importr('base').summary
def run_deseq(self, **kwargs):
    """
    actually running deseq2

    A ``reduced`` keyword, when present, is wrapped in ``Formula`` before
    the call. The fitted data set replaces ``self.dds`` and the result
    names are stored on ``self.comparison``.

    Args:
        **kwargs: Any keyword arguments for DESeq

    From DESeq2 manual:

    DESeq(
        object,
        test = c("Wald", "LRT"),
        fitType = c("parametric", "local", "mean", "glmGamPoi"),
        sfType = c("ratio", "poscounts", "iterate"),
        betaPrior,
        full = design(object),
        reduced,
        quiet = FALSE,
        minReplicatesForReplace = 7,
        modelMatrixType,
        useT = FALSE,
        minmu = if (fitType == "glmGamPoi") 1e-06 else 0.5,
        parallel = FALSE,
        BPPARAM = bpparam()
    )
    """
    if 'reduced' in kwargs:
        kwargs['reduced'] = Formula(kwargs['reduced'])

    self.dds = deseq.DESeq(self.dds, **kwargs)
    self.comparison = list(deseq.resultsNames(self.dds))
def __init__(self, count_matrix, design_matrix, design_formula, gene_column='gene_id'):
    """Convert the inputs to R objects and create the DESeq2 data set.

    Args:
        count_matrix: Pandas frame whose first column is the gene-id
            column; the other columns are cast to ``int``.
        design_matrix: Pandas frame describing the samples; cast to
            ``bool`` before conversion.
        design_formula: R formula string for the design.
        gene_column: Expected name of the first column of ``count_matrix``.

    Raises:
        AssertionError: The first column is not named ``gene_column``.
        SystemExit: ``count_matrix`` has no ``columns`` attribute.
    """
    print("you need to have R installed with the DESeq2 library installed")
    try:
        assert gene_column == count_matrix.columns[0], \
            "no $gene_column name in 1st column's name"
        gene_ids = count_matrix[gene_column]
    except AttributeError:
        sys.exit('Wrong Pandas dataframe?')

    print(rpy2.__version__)

    # Results that are filled in by later steps.
    self.deseq_result = None
    self.resLFC = None
    self.comparison = None
    self.normalized_count_matrix = None

    self.gene_column = gene_column
    self.gene_id = gene_ids

    with localconverter(ro.default_converter + pandas2ri.converter):
        counts_only = count_matrix.drop(gene_column, axis=1).astype(int)
        self.count_matrix = pandas2ri.py2rpy(counts_only)
        self.design_matrix = pandas2ri.py2rpy(design_matrix.astype(bool))

    self.design_formula = Formula(design_formula)
    self.dds = deseq.DESeqDataSetFromMatrix(
        countData=self.count_matrix,
        colData=self.design_matrix,
        design=self.design_formula)
def __init__(self, arch_lags, garch_lags, ar_lags=0, ma_lags=0):
    """Record the ARMA-GARCH lag orders and build the matching R formula.

    Args:
        arch_lags: Number of ARCH lags (coefficients ``alpha1``, ...).
        garch_lags: Number of GARCH lags (coefficients ``beta1``, ...).
        ar_lags: Number of AR lags (coefficients ``ar1``, ...).
        ma_lags: Number of MA lags (coefficients ``ma1``, ...).
    """
    # import R packages
    self.r_base = importr("base")
    self.r_fGarch = importr("fGarch")
    self.r_stats = importr("stats")

    # model specs
    self.arch_lags = arch_lags
    self.garch_lags = garch_lags
    self.ar_lags = ar_lags
    self.ma_lags = ma_lags

    # assign coefficient names: omega, ar1,...,ma1,...,alpha1,...,beta1,...
    def numbered(prefix, count):
        # prefix1, prefix2, ..., prefix<count>
        return [prefix + str(i) for i in range(1, count + 1)]

    self.ar_names = numbered("ar", ar_lags)
    self.ma_names = numbered("ma", ma_lags)
    self.arch_names = numbered("alpha", arch_lags)
    self.garch_names = numbered("beta", garch_lags)

    # R formula of the equation
    equation = "~arma({ar:1d},{ma:1d})+garch({p:1d},{q:1d})".format(
        ar=self.ar_lags, ma=self.ma_lags,
        p=self.arch_lags, q=self.garch_lags)
    self.formula = Formula(equation)
def my_evaluate(individual):
    """Score one candidate labelling on the metrics in ``metricasList``.

    ``individual`` is written to the ``label`` column of the module-level
    ``dataFrame``, which is also published to R's global environment. Each
    metric whose code ("1" to "6") is in ``metricasList`` contributes the
    absolute difference between its global reference value and the value
    measured for this labelling.

    Returns:
        A tuple with one value per selected metric, in code order. The
        function used to return ``None`` implicitly unless exactly 2, 3 or
        4 metrics were selected; it now returns a tuple for any number.
    """
    vetor = []
    dataFrame['label'] = individual
    robjects.globalenv['dataFrame'] = dataFrame
    fmla = Formula('label ~ .')

    if "1" in metricasList:
        ## imbalance
        imbalanceVector = stringr_c.balance_formula(
            fmla, dataFrame, measures="C2", summary="return")
        imbalance = imbalanceVector.rx(1)
        vetor.append(abs(globalBalance - imbalance[0][0]))

    if "2" in metricasList:
        ## -- linearity
        linearityVector = stringr_c.linearity_formula(
            fmla, dataFrame, measures="L2", summary="return")
        linearity = linearityVector.rx(1)
        vetor.append(abs(globalLinear - linearity[0][0]))

    if "3" in metricasList:
        ## -- neighborhood N1
        n1Vector = stringr_c.neighborhood_formula(
            fmla, dataFrame, measures="N1", summary="return")
        n1 = n1Vector.rx(1)
        vetor.append(abs(globalN1 - n1[0][0]))

    if "4" in metricasList:
        ## -- Network ClsCoef
        ClsCoefVector = stringr_c.network_formula(
            fmla, dataFrame, measures="ClsCoef", summary="return")
        ClsCoef = ClsCoefVector.rx(1)
        vetor.append(abs(globalClsCoef - ClsCoef[0][0]))

    if "5" in metricasList:
        ## -- Dimensionality T2
        t2Vector = stringr_c.dimensionality_formula(
            fmla, dataFrame, measures="T2", summary="return")
        t2 = t2Vector.rx(1)
        # NOTE(review): indexed with [0] here but [0][0] for every other
        # measure -- confirm this matches the shape T2 returns.
        vetor.append(abs(globalt2 - t2[0]))

    if "6" in metricasList:
        ## -- Feature-based F1
        f1Vector = stringr_c.overlapping_formula(
            fmla, dataFrame, measures="F1", summary="return")
        f1 = f1Vector.rx(1)
        vetor.append(abs(globalf1 - f1[0][0]))

    return tuple(vetor)
def spectra_difference(counts_table, group_label, test=False):
    """Fit a Poisson GLM of count on direction plus group.

    group_label is the column name for category

    Args:
        counts_table: Table with ``count``, ``direction`` and
            ``group_label`` columns.
        group_label: Name of the grouping column.
        test: When true, print the formula string.

    Returns:
        ``(total_re, dev, df, collated, formula)``: the deviance converted
        to relative entropy, the deviance, the residual degrees of freedom,
        the model data with ``fitted`` and ``ret`` columns added, and the
        formula string.
    """
    # we compare direction between group
    columns = ['count', 'direction', group_label]
    assert set(columns) <= set(counts_table.header)

    formula = "count ~ direction + %s" % group_label
    null = Formula(formula)
    if test:
        print(formula)

    r_data = as_dataframe(counts_table.get_columns(columns))
    model = R.glm(null, data=r_data, family="poisson")
    parts = dict(model.items())

    dev = parts['deviance'][0]
    df = parts['df.residual'][0]

    collated = convert_rdf_to_pandasdf(parts['data'])
    collated['fitted'] = list(parts['fitted.values'])

    dev_to_re = DevianceToRelativeEntropy(collated['count'].sum())
    calc_ret = CalcRet(dev_to_re)
    total_re = dev_to_re(dev)

    collated['ret'] = calc_ret(collated['count'], collated['fitted'])
    collated = collated.reindex(columns + ['fitted', 'ret'], axis=1)
    collated = collated.sort_values(by=columns[:-1])

    return total_re, dev, df, collated, formula
def __init__(self, count_matrix, design_matrix, design_formula, feature_column='id', var_column='condition', exons=None, genes=None, threads=1):
    """Store and convert the inputs needed for a DEXSeq run.

    Args:
        count_matrix: Pandas frame of counts including a feature-id column.
        design_matrix: Pandas frame describing the samples.
        design_formula: R formula string for the design.
        feature_column: Name of the feature-id column in ``count_matrix``.
        var_column: Column of ``design_matrix`` holding the variable.
        exons: Stored unchanged on ``self.exons``.
        genes: Stored unchanged on ``self.genes``.
        threads: Number of workers for ``MulticoreParam``.

    Raises:
        AssertionError: A required column is missing.
        SystemExit: An input has no ``columns`` attribute.
    """
    try:
        assert feature_column in count_matrix.columns, 'Wrong gene id column name'
        assert var_column in design_matrix.columns, 'Wrong var column for DEXSeq'
    except AttributeError:
        sys.exit('Wrong Pandas dataframe?')

    # Results that are filled in by later steps.
    self.dxd = None
    self.dxd_res = None
    self.dexseq_result = None
    self.comparison = None
    self.normalized_count_matrix = None

    self.feature_column = feature_column
    self.exons = exons
    self.genes = genes
    self.gene_id = count_matrix[self.feature_column]

    counts_only = count_matrix.drop(feature_column, axis=1)
    self.count_matrix = pandas2ri.py2ri(counts_only)
    self.design_matrix = pandas2ri.py2ri(design_matrix)
    self.design_formula = Formula(design_formula)

    self.BPPARAM = bp.MulticoreParam(workers=threads)
    self.var_column = var_column
r_analytical_set = pandas2ri.py2ri(analytical_set) print r_analytical_set print type(r_analytical_set) # get summary print R.table(r_analytical_set.rx('pass')) R('p <- 846/5062') R('odds <- p/(1 - p)') R('logit <- log(p/(1 - p))') R('invlogit <- function(x){ exp(x)/(1 + exp(x)) }') R('invlogit(logit)') #formula = 'pass~n' from rpy2.robjects import Formula formula = Formula('pass~n') formula.getenvironment()['pass'] = r_analytical_set.rx2('pass') formula.getenvironment()['n'] = r_analytical_set.rx2('n') #fit = R.glm(formula=formula, data=r_analytical_set, family=R('binomial(link="logit")')) import rpy2.robjects.packages as rpacks stats = rpacks.importr("stats") fit = stats.glm(formula = formula, family = stats.binomial(link = "logit"), data=r_analytical_set) s = R.summary(fit) print(fit) print(R.summary(fit)) R.plot(formula,
def getSimpleFormula(x, y):
    """Return the R formula ``y ~ x`` with both symbols bound.

    Args:
        x: Object bound to ``x`` in the formula's environment.
        y: Object bound to ``y`` in the formula's environment.

    Returns:
        The ``Formula`` object.
    """
    simple = Formula("y ~ x")
    for symbol, value in (("x", x), ("y", y)):
        simple.environment[symbol] = value
    return simple
xyplot = lattice.xyplot
#-- setupxyplot-end

#-- dataset-begin
rnorm = stats.rnorm
# Each column joins 300 draws around 0 with 100 draws around 3.
value = rnorm(300, mean=0) + rnorm(100, mean=3)
other_value = rnorm(300, mean=0) + rnorm(100, mean=3)
dataf_rnorm = robjects.DataFrame({'value': value,
                                  'other_value': other_value,
                                  'mean': IntVector([0] * 300 + [3] * 100)})
#-- dataset-end

grdevices.png('../../_static/graphics_lattice_xyplot_1.png',
              width=612, height=612, antialias="subpixel", type="cairo")
#-- xyplot1-begin
datasets = importr('datasets')
mtcars = datasets.mtcars
formula = Formula('mpg ~ wt')
# Bind the formula's symbols to the matching mtcars columns.
for column in ('mpg', 'wt'):
    formula.getenvironment()[column] = mtcars.rx2(column)

p = lattice.xyplot(formula)
rprint(p)
#-- xyplot1-end
grdevices.dev_off()

grdevices.png('../../_static/graphics_lattice_xyplot_2.png',
              width=612, height=612, antialias="subpixel", type="cairo")
#-- xyplot2-begin
p = lattice.xyplot(formula, groups=mtcars.rx2('cyl'))
rprint(p)
#-- xyplot2-end
grdevices.dev_off()