import numpy as np
import pandas as pd
from rpy2.rinterface import IntSexpVector, Sexp, baseenv
from rpy2.robjects import pandas2ri


def rpy2py_vector(v):
    """
    Converts vectors.

    Also handles NA in int vectors:
    https://github.com/rpy2/rpy2/issues/376
    """
    if not isinstance(v, Sexp):
        return v
    if isinstance(v, IntSexpVector):
        assert v._R_SIZEOF_ELT == 4, "R integer size changed away from 32 bit"
        if "factor" in v.rclass:
            r = pandas2ri.rpy2py(v)
        else:
            r = pd.array(v, dtype=pd.Int32Dtype())
            # R encodes NA_integer_ as a sentinel int; mask those positions explicitly.
            r[np.array(baseenv["is.na"](v), dtype=bool)] = pd.NA
        return r
    return pandas2ri.rpy2py(v)
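# A minimal usage sketch, assuming an embedded R session is available through
# rpy2: convert an R integer vector containing NA_integer_. rpy2py_vector
# returns a nullable Int32 pandas array with pd.NA where R had NA, instead of
# the sentinel integer that a plain conversion would surface.
from rpy2.robjects import r

int_vec = r("c(1L, NA_integer_, 3L)")
print(rpy2py_vector(int_vec))  # <IntegerArray> [1, <NA>, 3]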
from rpy2.robjects import pandas2ri


def data_frame_to_string(r_data_frame, add_rownames=False) -> str:
    """
    Convert an R data.frame to a string representation using rpy2.

    :param r_data_frame: The R data.frame object
    :param add_rownames: If set, rownames are added to the output
    :returns: The string representation in tab-delimited format.
    """
    # convert the R data.frame to a pandas DataFrame
    data_frame = pandas2ri.rpy2py(r_data_frame)

    # initialise the list of rows with the header and an optional empty first field
    if add_rownames:
        all_lines = ["\t" + "\t".join(list(data_frame.columns))]
    else:
        all_lines = ["\t".join(list(data_frame.columns))]

    # add each row
    for rowname, row in data_frame.iterrows():
        if add_rownames:
            this_line = [rowname] + list(row)
        else:
            this_line = list(row)

        # convert to single string
        all_lines.append("\t".join([str(value) for value in this_line]))

    # join the lines
    complete_string = "\n".join(all_lines) + "\n"

    return complete_string
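# A quick usage sketch, assuming an embedded R session: build a small
# data.frame in R and render it as tab-delimited text with rownames.
from rpy2.robjects import r

r_df = r('data.frame(a = c(1, 2), b = c("x", "y"), row.names = c("r1", "r2"))')
print(data_frame_to_string(r_df, add_rownames=True))
# prints a header line ("\ta\tb") followed by one tab-separated line per row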
from rpy2.robjects import FloatVector, pandas2ri, r
from rpy2.robjects.packages import importr


def pcor(var1, var2, covariate, method='spearman'):
    '''Run R ppcor's partial correlation

    Key arguments:
        var1, var2, covariate: float or int numpy array
        method: str, 'spearman' or 'pearson'
    '''
    # import ppcor library in R
    ppcor = importr('ppcor')

    # define variables in R
    x = FloatVector(var1)
    y = FloatVector(var2)
    c = FloatVector(covariate)

    # assign values
    r.assign('x', x)
    r.assign('y', y)
    r.assign('c', c)

    # run partial correlation in R and return outputs to python
    r(f'pcorOut <- pcor.test(x, y, c, method = "{method}")')
    pcor_out = r('pcorOut')
    pcor_out_df = pandas2ri.rpy2py(pcor_out)

    return pcor_out_df
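# Example call, assuming equal-length numeric arrays; ppcor's pcor.test returns
# a one-row data.frame with estimate, p.value, statistic, n, gp and Method.
import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(size=50)
b = a + rng.normal(size=50)
z = rng.normal(size=50)
print(pcor(a, b, z, method='spearman'))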
def limma_camera(matrix, design, weights, limma_stats, groups, coef,
                 group_name='group'):
    assert set(matrix.index) == set(limma_stats.index)

    limma_group_stats = {}
    limma_empirical_stats = {}
    for group, index in groups.items():
        limma_subdf = limma_stats.loc[index]
        df_stats = limma_subdf.mean()
        df_stats['proteins'] = '/'.join(sorted(index))

        __, empirical_median, empirical_median_left, empirical_median_right = \
            empirical_ci(limma_subdf, random_state=RANDOM_STATE)
        row = pd.Series(
            [empirical_median, empirical_median_left, empirical_median_right],
            index=[
                'empirical_median', 'empirical_median_ci_left',
                'empirical_median_ci_right'
            ])

        limma_group_stats[group] = df_stats
        limma_empirical_stats[group] = row

    limma_group_stats = pd.DataFrame(limma_group_stats).T
    limma_group_stats.index.name = group_name
    limma_group_stats.columns = [
        f'mean_{c}' for c in limma_group_stats.columns
    ]

    limma_empirical_stats = pd.DataFrame(limma_empirical_stats).T
    limma_empirical_stats.index.name = group_name

    r_groups = _to_r_listvector_of_string(groups)
    r_matrix, r_design, r_weights = to_r_matrix_design_and_weights(
        matrix, design, weights)

    r_camera_res = r_limma.camera(r_matrix,
                                  contrast=coef,
                                  index=r_groups,
                                  design=r_design,
                                  weights=r_as_matrix(r_weights),
                                  **{'use.ranks': False})
    camera_res = pandas2ri.rpy2py(r_camera_res)
    camera_res.index.name = group_name
    camera_res = camera_res.join(limma_group_stats).join(limma_empirical_stats)
    return camera_res
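# A hedged sketch of the inputs limma_camera expects, inferred from the body
# above; the helpers it references (empirical_ci, _to_r_listvector_of_string,
# to_r_matrix_design_and_weights, r_limma, r_as_matrix, RANDOM_STATE) are
# assumed to be defined elsewhere in the module.
#
#   matrix:      proteins x samples expression DataFrame
#   design:      samples x coefficients design DataFrame
#   weights:     per-observation weights, same shape as `matrix`
#   limma_stats: per-protein limma statistics, indexed like `matrix`
#   groups:      dict mapping a gene-set name to its protein IDs, e.g.
#                {'complex_A': ['P1', 'P2'], 'complex_B': ['P3', 'P4']}
#   coef:        the design-matrix contrast to test
#
# camera_res = limma_camera(matrix, design, weights, limma_stats, groups,
#                           coef='treatment')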
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri


def calc_knn_sale(user, password, host, database, port):
    r = robjects.r
    r['source']('for_sale_KNN_dynamic_script.R')  # source the R file
    try:
        # look up the R entry-point function in the global environment
        get_main_function_r = robjects.globalenv['main_forsale_knn']
        df_result_r = get_main_function_r(user, password, host, database, port)
        df_result = pandas2ri.rpy2py(df_result_r)
    except Exception as e:
        print("KNN Sale Building model failed:", e)
        return None
    return df_result
def calc_neuralnet_census(user, password, host, database, port):
    r = robjects.r
    r['source']('nn_census_script.R')  # source the R file
    try:
        # look up the R entry-point function in the global environment
        get_main_function_r = robjects.globalenv['mainfunction.all']
        df_result_r = get_main_function_r(host, user, password, database, port)
        df_result = pandas2ri.rpy2py(df_result_r)
    except Exception as e:
        print("NeuralNet Census model failed:", e)
        return None
    return df_result
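# Both wrappers follow the same pattern: source an R script, look up its
# entry-point function in R's global environment, call it with connection
# details, and convert the returned data.frame to pandas. A hypothetical
# invocation (the credentials are placeholders, and the .R scripts must exist
# alongside this module):
df_sale = calc_knn_sale('dbuser', 'secret', 'localhost', 'realestate', '5432')
if df_sale is not None:
    print(df_sale.head())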
def _varimax(self, factor_df, **kwargs):
    '''varimax rotation of factor matrix

    Args:
        factor_df: factor matrix as pd.DataFrame with shape
            (# features, # principal components)

    Return:
        rot_factor_df: rotated factor matrix as pd.DataFrame
    '''
    factor_mtr = self._df2mtr(factor_df)
    varimax = robjects.r['varimax']
    rot_factor_mtr = varimax(factor_mtr)
    return pandas2ri.rpy2py(rot_factor_mtr.rx2('loadings'))
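# A minimal sketch of calling the rotation, assuming `model` is an instance of
# the owning class and its `_df2mtr` converts the DataFrame to an R matrix.
# The loadings matrix has features as rows and principal components as columns.
import pandas as pd

loadings = pd.DataFrame([[0.8, 0.1], [0.7, 0.2], [0.1, 0.9]],
                        index=['feat1', 'feat2', 'feat3'],
                        columns=['PC1', 'PC2'])
# rotated = model._varimax(loadings)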
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri


def load_data(filename):
    """
    Loads data for the models in RDS format from bnlearn.com

    Parameters
    ----------
    filename : RDS file path, e.g. './asia.rds'

    Returns
    -------
    data : the deserialised RDS data, converted to pandas
    """
    readRDS = robjects.r['readRDS']
    data = readRDS(filename)
    data = pandas2ri.rpy2py(data)
    return data
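# Hypothetical usage, assuming './asia.rds' holds a data.frame downloaded
# from bnlearn.com:
# df = load_data('./asia.rds')
# print(df.head())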
import rpy2.robjects as ro
from rpy2.robjects import Formula, pandas2ri, r
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr


def fit_lme(formula, df, family='gaussian', optimizer='nloptwrap',
            random_effect=True, **fit_kwargs):
    f = Formula(formula)
    lme4 = importr('lme4')
    lmer = importr('lmerTest')  # overloads lmer function from lme4 package
    base = importr('base')
    stats = importr('stats')

    with localconverter(ro.default_converter + pandas2ri.converter):
        if family == 'gaussian':
            if random_effect:
                control = lme4.lmerControl(
                    **{
                        'calc.derivs': True,
                        'check.rankX': 'silent.drop.cols',
                        'check.conv.singular':
                            r('lme4::.makeCC')(action="ignore", tol=1e-4)
                    })
                fit = lmer.lmer(f, df, control=control, **fit_kwargs)
            else:
                fit = stats.lm(f, df, **fit_kwargs)
        elif family in ('binomial', 'poisson'):
            if random_effect:
                if optimizer == 'nloptwrap':
                    control = lme4.glmerControl(
                        **{
                            'optimizer': 'nloptwrap',
                            'calc.derivs': True,
                            'check.rankX': 'silent.drop.cols',
                            'check.conv.singular':
                                r('lme4::.makeCC')(action="ignore", tol=1e-4)
                        })
                else:
                    control = lme4.glmerControl(
                        **{
                            'check.rankX': 'silent.drop.cols',
                            'check.conv.singular':
                                r('lme4::.makeCC')(action="ignore", tol=1e-4)
                        })
                fit = lme4.glmer(f, df, control=control, family=family,
                                 **fit_kwargs)
            else:
                fit = stats.glm(f, df, family=family, **fit_kwargs)
        else:
            # any other family is fitted as negative binomial
            if random_effect:
                if optimizer == 'nloptwrap':
                    control = lme4.glmerControl(
                        **{
                            'optimizer': 'nloptwrap',
                            'calc.derivs': True,
                            'check.rankX': 'silent.drop.cols',
                            'check.conv.singular':
                                r('lme4::.makeCC')(action="ignore", tol=1e-4)
                        })
                    fit = r('lme4::glmer.nb')(f, df,
                                              **{'nb.control': control},
                                              **fit_kwargs)
                else:
                    fit = r('lme4::glmer.nb')(f, df, **fit_kwargs)
            else:
                fit = r('MASS::glm.nb')(f, df, **fit_kwargs)

    anova_df = stats.anova(fit)
    coef_df = r['as.data.frame'](stats.coef(base.summary(fit)))
    coef_df = pandas2ri.rpy2py(coef_df)

    return coef_df, anova_df
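# A sketch of a mixed-model call, assuming a long-format DataFrame with a
# numeric response, one fixed effect, and a grouping factor for the random
# intercept; the lme4-style formula is passed straight through to R.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
demo_df = pd.DataFrame({
    'y': rng.normal(size=60),
    'treatment': np.tile(['a', 'b', 'c'], 20),
    'subject': np.repeat([f's{i}' for i in range(20)], 3),
})
coef_df, anova_df = fit_lme('y ~ treatment + (1 | subject)', demo_df)
print(coef_df)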
def call_fitter(
        site_inputs_training,
        y_training,
        site_inputs_validation,
        hprm,
):
    assert y_training.ndim == 1
    path_R_files = os.path.join(
        paths.outputs,
        'R_files/',
    )
    os.makedirs(
        path_R_files,
        exist_ok=True,
    )

    ### Data
    data_training = {
        **{
            simplify_inpt_name(inpt, trsfm, prm, location):
            site_inputs_training[inpt, trsfm, prm, location].values
            for inpt, trsfm, prm, location in site_inputs_training
        },
        'target': y_training.values,
    }
    data_validation = {
        simplify_inpt_name(inpt, trsfm, prm, location):
        site_inputs_validation[inpt, trsfm, prm, location].values
        for inpt, trsfm, prm, location in site_inputs_validation
    }

    # Convert arrays
    pandas2ri.activate()
    df_train = pandas2ri.py2rpy(pd.DataFrame.from_dict(data_training))
    df_test = pandas2ri.py2rpy(pd.DataFrame.from_dict(data_validation))
    pandas2ri.deactivate()

    # Save converted files
    r.assign("data_train", df_train)
    r("save(data_train, file='{0}/temp_dat_for_r_train.gzip', compress=TRUE)"
      .format(path_R_files))
    r.assign("data_test", df_test)
    r("save(data_test, file='{0}/temp_dat_for_r_test.gzip', compress=TRUE)"
      .format(path_R_files))

    nb_unique = {k: len(np.unique(v)) for k, v in site_inputs_training.items()}
    string_formula = make_gam_formula(
        site_inputs_training.columns,
        nb_unique,
        hprm,
    )

    ### Launch the R script
    path2script = os.path.join(
        os.path.dirname(__file__),
        'load_fit_predict_savePredictions.R',
    )
    args = [string_formula, path_R_files]
    # Python will quote what must be quoted in subprocess.check_output
    cmd = ['Rscript', path2script] + args
    print('launch Rscript')
    x = subprocess.check_output(cmd, universal_newlines=True)
    print(x)

    y_hat_training = r['read.table'](
        "{0}/predictions_from_r_train.gzip".format(path_R_files))
    y_hat_training = pandas2ri.rpy2py(y_hat_training)
    y_hat_training = y_hat_training.values

    y_hat_validation = r['read.table'](
        "{0}/predictions_from_r_test.gzip".format(path_R_files))
    y_hat_validation = pandas2ri.rpy2py(y_hat_validation)
    y_hat_validation = y_hat_validation.values

    return y_hat_training, y_hat_validation
import rpy2.robjects.pandas2ri as pandas2ri

pw_dmr_calls = pd.DataFrame(
    {"rds_path": dmr_calls_dir + "/" + pd.Series(os.listdir(dmr_calls_dir))})
pw_dmr_calls["pop"] = pw_dmr_calls["rds_path"].str.extract(
    r".*_dmrs_hsc_vs_([\w-]+)_0.01", expand=False)
pw_dmr_calls

gain_loss_counts = pd.DataFrame(-1,
                                columns=["Gain", "Loss"],
                                index=pw_dmr_calls["pop"])
for _unused, row_ser in pw_dmr_calls.iterrows():
    # chr start end length nCG meanMethy1 meanMethy2 diff.Methy areaStat
    gain_loss_for_pop = (np.sign(
        pandas2ri.rpy2py(base.readRDS(row_ser["rds_path"])).eval(
            "meanMethy2 - meanMethy1")).value_counts().sort_index().set_axis(
                ["Loss", "Gain"]))
    gain_loss_counts.loc[row_ser["pop"]] = gain_loss_for_pop
gain_loss_counts = gain_loss_counts.sort_values("Loss")
gain_loss_counts.head()

pw_counts_plot_df = (gain_loss_counts.stack().reset_index().set_axis(
    ["Population", "Direction", "No. of DMRs"], axis=1))
pw_counts_plot_df

fig, ax = plt.subplots(1,
                       1,
                       dpi=300,
                       constrained_layout=True,
                       figsize=(4 / 2.54, 3 / 2.54))
def to_dataframe(x):
    return pandas2ri.rpy2py(r_as_dataframe(x))
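# Assuming r_as_dataframe was bound earlier, e.g.
#     r_as_dataframe = rpy2.robjects.r['as.data.frame']
# this coerces anything R's as.data.frame accepts (matrix, table, list of
# columns) into a pandas DataFrame:
# df = to_dataframe(rpy2.robjects.r('table(c("a", "a", "b"))'))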