def __init__(self, count_matrix, design_matrix, design_formula, gene_column='gene_id'):
    """Build a DESeq2 dataset from pandas inputs.

    Args:
        count_matrix (pd.DataFrame): raw counts; the FIRST column must be
            the gene-identifier column named `gene_column`.
        design_matrix (pd.DataFrame): sample covariates (coerced to bool).
        design_formula (str): R model formula, e.g. '~ condition'.
        gene_column (str): name of the gene-identifier column.
    """
    print("you need to have R installed with the DESeq2 library installed")
    try:
        # Fix: interpolate the actual column name — the original message
        # printed the literal text '$gene_column'.
        assert gene_column == count_matrix.columns[0], \
            f'no {gene_column} name in 1st column\'s name'
    except AttributeError:
        # count_matrix had no .columns — not a pandas DataFrame.
        sys.exit('Wrong Pandas dataframe?')
    print(rpy2.__version__)
    # Result slots populated by later analysis calls.
    self.deseq_result = None
    self.resLFC = None
    self.comparison = None
    self.normalized_count_matrix = None
    self.gene_column = gene_column
    # Single source of truth for gene ids (the original computed this twice).
    self.gene_id = count_matrix[self.gene_column]
    with localconverter(ro.default_converter + pandas2ri.converter):
        # Counts go to R without the gene column, as integers.
        self.count_matrix = pandas2ri.py2rpy(
            count_matrix.drop(gene_column, axis=1).astype(int))
        self.design_matrix = pandas2ri.py2rpy(design_matrix.astype(bool))
        self.design_formula = Formula(design_formula)
        self.dds = deseq.DESeqDataSetFromMatrix(countData=self.count_matrix,
                                                colData=self.design_matrix,
                                                design=self.design_formula)
def py2rpy_anndata(obj: AnnData) -> RS4:
    """Convert an AnnData object into an R SingleCellExperiment (S4 object)."""
    with localconverter(default_converter):
        s4v = importr("S4Vectors")
        sce = importr("SingleCellExperiment")
        # TODO: sparse
        # X (when present) and every layer become assays, transposed to R's
        # features-x-cells orientation.
        x = {} if obj.X is None else dict(X=mat_converter.py2rpy(obj.X.T))
        layers = {k: mat_converter.py2rpy(v.T) for k, v in obj.layers.items()}
        assays = ListVector({**x, **layers})

        # var -> rowData; row names are attached only when var_names has no
        # duplicates (check_no_dupes guards this).
        row_args = {k: pandas2ri.py2rpy(v) for k, v in obj.var.items()}
        if check_no_dupes(obj.var_names, "var_names"):
            row_args["row.names"] = pandas2ri.py2rpy(obj.var_names)
        row_data = s4v.DataFrame(**row_args)

        # obs -> colData, with the same duplicate guard for obs_names.
        col_args = {k: pandas2ri.py2rpy(v) for k, v in obj.obs.items()}
        if check_no_dupes(obj.obs_names, "obs_names"):
            col_args["row.names"] = pandas2ri.py2rpy(obj.obs_names)
        col_data = s4v.DataFrame(**col_args)

        # Convert everything we know
        with localconverter(full_converter() + dict_converter):
            metadata = ListVector(obj.uns.items())

        # obsm -> reducedDims; keys are mapped from scanpy to SCE naming.
        rd_args = {conv_name.scanpy2sce(k): mat_converter.py2rpy(obj.obsm[k])
                   for k in obj.obsm.keys()}
        reduced_dims = s4v.SimpleList(**rd_args)

        return sce.SingleCellExperiment(
            assays=assays,
            rowData=row_data,
            colData=col_data,
            metadata=metadata,
            reducedDims=reduced_dims
        )
def __init__(self, count_matrix, design_matrix, conditions, gene_column='id'):
    """Prepare pandas inputs for a DESeq2 run over multiple conditions.

    Args:
        count_matrix (pd.DataFrame): counts with gene ids in `gene_column`.
        design_matrix (pd.DataFrame): sample covariates; each column listed
            in `conditions` is converted to an R factor.
        conditions (list[str]): covariate columns used in the design formula.
        gene_column (str): name of the gene-identifier column.
    """
    self.dds = None
    self.deseq_result = None
    self.resLFC = None
    self.comparison = None
    self.normalized_count_matrix = None
    self.gene_column = gene_column
    self.gene_id = count_matrix[self.gene_column]
    self.count_matrix = pandas2ri.py2rpy(
        count_matrix.drop(gene_column, axis=1))
    for col in conditions:
        levels = design_matrix[col].unique()
        # NOTE(review): _convert_rpy2py_strvector is a private rpy2 helper;
        # it appears to coerce the numpy array of levels to an R character
        # vector — confirm against the rpy2 version in use.
        levels = robjects._convert_rpy2py_strvector(levels)
        as_factor = r["as.factor"]
        design_matrix[col] = FactorVector(design_matrix[col], levels=levels)
        design_matrix[col] = as_factor(design_matrix[col])
    # Fix: build the formula with a join instead of appending " +" per column
    # and slicing the tail off — the original produced "~ a +b" (missing
    # separator spacing) and was fragile to edit.
    design_formula = "~ " + " + ".join(conditions)
    self.design_matrix = pandas2ri.py2rpy(design_matrix)
    self.design_formula = Formula(design_formula)
def _infer_network(self, data): """ Infer the network. Args: data (pd.DataFrame): data to be used for the inference. """ # activate implicit conversion from pandas to R objects pandas2ri.activate() genie3 = importr('GENIE3') importr('foreach') importr('doParallel') # transform pandas dataframe into GENIE3 input format # via first automatic conversion to data.frame from pd.DataFrame # to matrix with `as.matrix` to preserve colnames and rownames expr_matrix = as_matrix(pandas2ri.py2rpy(data.T)) # run GENIE3 values = genie3.GENIE3( expr_matrix, self.regulators, self.targets, self.tree_method, self.k, self.n_trees, self.n_cores, self.verbose ) weight_matrix = pd.DataFrame( values, columns=data.columns, index=data.columns ) self.graph = Graph(adjacency=weight_matrix) logger.debug('inferred with {}'.format(self.method))
def fit(self, dfx: pd.DataFrame, outcome_col, covariate_cols, teacher_id_col, **argv):
    """Fit a fixed-effects regression via R's lfe::felm and store results.

    Stores:
        self.effect: the getfe() fixed-effect table.
        self.residuals_without_fixed / self.residuals_with_fixed: residual
            series aligned to dfx.index (rows dropped by dropna stay NaN).
    """
    # Covariates that are not themselves fixed-effect columns.
    covariate_cols_except_fixed = [
        x for x in covariate_cols if x not in self.fixed_effect_cols
    ]
    fixed_effect_cols_plus_tid = [teacher_id_col] + self.fixed_effect_cols
    dropna_subset_cols = [outcome_col
                          ] + covariate_cols + fixed_effect_cols_plus_tid
    formula = create_felm_formula(outcome_col, covariate_cols_except_fixed,
                                  fixed_effect_cols_plus_tid,
                                  self.factor_cols)
    pandas2ri.activate()
    df_use = dfx.dropna(subset=dropna_subset_cols)
    _res1 = self.r.assign("r_df", pandas2ri.py2rpy(df_use))
    _res2 = self.r(
        "res <- lfe::felm({formula}, r_df)".format(formula=formula))
    bb = self.r("lfe::getfe(res)")
    self.effect = bb
    # r.residuals excludes the fixed-effect contribution; residuals includes it.
    self.residuals_without_fixed = pd.Series(index=dfx.index)
    self.residuals_without_fixed.loc[df_use.index, ] = self.r(
        "res$r.residuals")[:, 0]
    self.residuals_with_fixed = pd.Series(index=dfx.index)
    self.residuals_with_fixed.loc[df_use.index, ] = self.r(
        "res$residuals")[:, 0]
    pandas2ri.deactivate()
def fitdist(data: pd.Series, **kwargs):
    """Fit a univariate distribution to *data* with R's fitdistrplus::fitdist.

    Extra keyword arguments are forwarded verbatim to the R function.
    See: https://cran.r-project.org/web/packages/fitdistrplus/fitdistrplus.pdf
    """
    r_vector = pandas2ri.py2rpy(data)
    return fitdistrplus.fitdist(r_vector, **kwargs)
def __init__(self, count_matrix, design_matrix, design_formula):
    """Wrap pandas count/design matrices as a DESeq2 dataset.

    Genes are read from the count matrix index, samples from its columns.
    """
    # Result slots filled in by later analysis calls.
    self.dds = None
    self.deseq_result = None
    self.resLFC = None
    self.comparison = None
    self.normalized_count_df = None

    # Bookkeeping taken straight from the pandas axes.
    self.gene_id = count_matrix.index
    self.samplenames = count_matrix.columns

    # Hand everything over to R and assemble the DESeq2 dataset.
    self.count_matrix = pandas2ri.py2rpy(count_matrix)
    self.design_matrix = pandas2ri.py2rpy(design_matrix)
    self.design_formula = Formula(design_formula)
    self.dds = deseq.DESeqDataSetFromMatrix(countData=self.count_matrix,
                                            colData=self.design_matrix,
                                            design=self.design_formula)
def simpleNetworkx(G):
    """Render a graph as an interactive D3 network saved to 'Net.html'.

    Args:
        G: a networkx-style graph; G.edges() supplies (src, target) pairs.

    Returns:
        None — writing 'Net.html' is the side effect.
    """
    ro.r('src = c()')
    ro.r('target =c()')
    ro.r('rdf=data.frame()')
    df = p.DataFrame(data=G.edges())
    df_r = pandas2ri.py2rpy(df)
    ro.globalenv['src'] = df_r[0]
    ro.globalenv['target'] = df_r[1]
    ro.r('rdf=data.frame(src,target)')
    utils = importr('utils')
    utils.chooseCRANmirror(ind=1)
    # Fix: the original used bare `except:` clauses, which would also swallow
    # KeyboardInterrupt/SystemExit; catch Exception instead and install the
    # missing package on failure (best-effort behavior preserved).
    try:
        networkD3 = importr('networkD3')
    except Exception:
        utils.install_packages('networkD3')
        networkD3 = importr('networkD3')
    try:
        magrittr = importr('magrittr')
    except Exception:
        utils.install_packages('magrittr')
        magrittr = importr('magrittr')
    ro.r('''simpleNetwork(rdf) %>% saveNetwork(file = 'Net.html')''')
    return None
def fit(
    self,
    x: Optional[np.ndarray] = None,
    y: Optional[np.ndarray] = None,
    w: Optional[np.ndarray] = None,
    **kwargs,
) -> "GamMGCVModel":
    """
    Fit the model.

    Params
    ------
    x
        Independent variables.
    y
        Dependent variables.
    w
        Weights of :paramref:`x`.
    kwargs
        Keyword arguments.

    Returns
    -------
    :class:`cellrank.ul.models.GamMGCVModel`
        Return fitted self.
    """
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri, Formula
    from rpy2.robjects.packages import importr

    super().fit(x, y, w, **kwargs)

    # Only points with strictly positive weight take part in the fit.
    use_ixs = np.where(self.w > 0)[0]
    self._x = self.x[use_ixs]
    self._y = self.y[use_ixs]
    self._w = self.w[use_ixs]

    n_splines = kwargs.pop("n_splines", self._n_splines)
    mgcv = importr("mgcv")
    pandas2ri.activate()
    # Two-column frame (x, y) restricted to the used points, handed to R.
    df = pandas2ri.py2rpy(
        pd.DataFrame(np.c_[self.x, self.y][use_ixs, :], columns=["x", "y"]))
    # Cubic-regression-spline GAM with k = n_splines basis functions.
    self._model = mgcv.gam(
        Formula(f'y ~ s(x, k={n_splines}, bs="cr")'),
        data=df,
        sp=self._sp,
        family=robjects.r.gaussian,
        weights=pd.Series(self.w[use_ixs]),
    )
    pandas2ri.deactivate()

    return self
def generate_args(n_args=256, max_rows=100, lang="py", template_path=None):
    """Create multiple dataframes based on a CSV template.

    Args:
        n_args (int): number of dataframes to generate.
        max_rows (int): maximum rows per generated dataframe.
        lang (str): "py" keeps pandas frames; "r" converts each to an R
            data frame via rpy2.
        template_path (str | None): path to the template CSV. Defaults to
            the module-level TEMPLATE_PATH (new, backward-compatible
            parameter generalizing the previously hard-coded path).

    Returns:
        list: the generated dataframes.
    """
    if template_path is None:
        template_path = TEMPLATE_PATH
    df_template = pd.read_csv(template_path)
    args = []
    for _ in range(n_args):  # loop index was unused in the original
        new_df = construct_df(df_template, max_rows)
        if lang == "r":
            new_df = pandas2ri.py2rpy(new_df)
        args.append(new_df)
    return args
def _gam_fit_predict(x, y, weights=None, pred_x=None):
    """Fit y ~ s(x) with R's `gam` package and predict with a rough std band.

    Args:
        x, y: 1-D observation arrays of equal length.
        weights: per-point weights; None means uniform 1.0. Points with
            weight <= 0 are excluded from the fit.
        pred_x: points to predict at; defaults to `x`.

    Returns:
        (y_pred, stds): predictions at `pred_x` and a heuristic standard-
        deviation estimate (half the classic prediction-interval term).
    """
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri, Formula
    from rpy2.robjects.packages import importr

    pandas2ri.activate()

    # Weights
    if weights is None:
        weights = np.repeat(1.0, len(x))

    # Construct dataframe
    use_inds = np.where(weights > 0)[0]
    r_df = pandas2ri.py2rpy(
        pd.DataFrame(np.array([x, y]).T[use_inds, :], columns=["x", "y"]))

    # Fit the model
    rgam = importr("gam")
    model = rgam.gam(Formula("y~s(x)"),
                     data=r_df,
                     weights=pd.Series(weights[use_inds]))

    # Predictions
    if pred_x is None:
        pred_x = x
    y_pred = np.array(
        robjects.r.predict(model,
                           newdata=pandas2ri.py2rpy(
                               pd.DataFrame(pred_x, columns=["x"]))))

    # Standard deviations: residual sigma from in-sample predictions over
    # the points actually used in the fit.
    p = np.array(
        robjects.r.predict(model,
                           newdata=pandas2ri.py2rpy(
                               pd.DataFrame(x[use_inds], columns=["x"]))))
    n = len(use_inds)
    sigma = np.sqrt(((y[use_inds] - p)**2).sum() / (n - 2))
    # OLS-style prediction-interval width, halved (deliberate heuristic).
    stds = (np.sqrt(1 + 1 / n + (pred_x - np.mean(x))**2 /
                    ((x - np.mean(x))**2).sum()) * sigma / 2)

    return y_pred, stds
def art_2by2(df: pd.DataFrame, feature: str, group: str):
    """2x2 aligned-rank-transform ANOVA (R ARTool) of `feature` vs 'all'
    between the 'wt' group and `group`.

    Returns:
        The R anova() table for the fitted ART model.
    """
    # Wide -> long: one row per (sample, variable), variable in {feature, 'all'}.
    feature_wide = df[["Unnamed: 0", feature, "all", "group"]]
    feature_long = feature_wide.melt(id_vars=["Unnamed: 0", "group"],
                                     value_vars=[feature, "all"])
    # Keep only the wild-type rows and the requested comparison group.
    feature_long = feature_long.query(f"group in ('wt', '{group}')")
    # Clamp negatives to zero before the rank transform.
    feature_long.loc[feature_long["value"] < 0, "value"] = 0
    feature_long["group"] = feature_long["group"].astype("category")
    feature_long["variable"] = feature_long["variable"].astype("category")
    r_df = pandas2ri.py2rpy(feature_long)
    model = artool.art(robj.Formula("value ~ group * variable"), data=r_df)
    anova = robj.r["anova"]
    return anova(model)
def query_log_source(source, time_filter, time_column):
    """Fetch rows newer than `time_filter` days from `source` as an R dataframe.

    Args:
        source (str): table/view name to query.
        time_filter (int): lookback window in days.
        time_column (str): timestamp column to filter on.

    Returns:
        An rpy2 data frame with the query results.

    NOTE(review): `source` and `time_column` are interpolated directly into
    the SQL text; callers must ensure these identifiers are trusted.
    """
    cutoff = f"DATEADD(day, -{time_filter}, CURRENT_TIMESTAMP())"
    query = f"SELECT * FROM {source} WHERE {time_column} > {cutoff};"
    try:
        data = list(db.fetch(query))
    except Exception as e:
        # Fix: the original passed `e` as a stray positional argument to
        # log.error (the message had no placeholder, so the error was lost)
        # and then fell through to use an undefined `data`, masking the real
        # failure with a NameError. Log properly and re-raise.
        log.error("Failed to query log source: %s", e)
        raise
    f = pack(data)
    frame = pandas.DataFrame(f)
    pandas2ri.activate()
    r_dataframe = pandas2ri.py2rpy(frame)
    return r_dataframe
def DESeq2(count_matrix, design_matrix, normalize, cores=1):
    """Run DESeq2 (intercept-only design) and return baseMean + dispersion.

    Args:
        count_matrix (pd.DataFrame): raw counts (rounded to integers here).
        design_matrix (pd.DataFrame): sample metadata for colData.
        normalize (pd.DataFrame | None): when given, its 'libsize_75percent'
            column (indexed by sample name) overrides DESeq2's size factors.
        cores (int): workers for BiocParallel.

    Returns:
        pd.DataFrame: columns ['baseMean', 'dispersion'] per gene.
    """
    # gene_column = ''
    to_dataframe = ro.r('function(x) data.frame(x)')
    count_matrix = round(count_matrix)
    count_matrix = pandas2ri.py2rpy(count_matrix)
    design_matrix = pandas2ri.py2rpy(design_matrix)
    # Intercept-only model: dispersion estimation without condition effects.
    design_formula = Formula(' ~ 1')
    dds0 = deseq.DESeqDataSetFromMatrix(countData=count_matrix,
                                        colData=design_matrix,
                                        design=design_formula)
    dds0 = BiocGenerics.estimateSizeFactors(dds0, type="poscounts")
    # Sample order as DESeq2 sees it, used to align the custom size factors.
    order_size_factor = list(dds0.do_slot('colData').do_slot('rownames'))
    if normalize is not None:
        logging.info("Enforcing custom normalisation in DESeq2")
        # NOTE(review): listData slot index 1 is assumed to be the sizeFactor
        # column — confirm against the DESeq2 version in use.
        dds0.do_slot('colData').do_slot(
            'listData')[1] = ro.vectors.FloatVector(
                list(normalize.loc[
                    order_size_factor, 'libsize_75percent']))  # Enforce size factors
    else:
        logging.info("WARNING: default size factor of DESeq2 are used")
    dds = deseq.DESeq(
        dds0,
        parallel=True,
        BPPARAM=BiocParallel.MulticoreParam(cores),
        sfType=
        "poscounts",  # Will run 1. estimation of size factors: estimateSizeFactors
        # parameter "poscounts"
        fitType=
        "parametric"  # 2. estimation of dispersion: estimateDispersions
        # parameter "parametric"
    )
    deseq_result = deseq.results(dds)
    fit_res = to_dataframe(deseq_result)
    disp = to_dataframe(deseq.dispersions(dds)).rename({'x': 'dispersion'},
                                                       axis=1)
    disp.index = fit_res.index
    fit_res = pd.concat([fit_res['baseMean'], disp], axis=1)
    return fit_res
def main():
    """Script entry: estimate teacher value-added on math scores, comparing
    an lfe::felm residual run (via rpy2) with the TeacherValueAddedEstimator."""
    import pandas as pd
    from rpy2.robjects import pandas2ri
    import rpy2.robjects as ro
    from teacher_va.estimate import TeacherValueAddedEstimator, StudentDataFrame

    def give_group_name(dfx, keys, group_name_col='name'):
        # Assign a sequential group id to each unique key combination.
        aa = dfx[keys].drop_duplicates().dropna()
        aa[group_name_col] = 1
        aa[group_name_col] = aa[group_name_col].cumsum()
        return (dfx.merge(aa, on=keys, how='left'))

    pd.set_option("display.max_columns", 101)
    df = (
        pd.read_csv('data/math_teacher.csv').pipe(
            give_group_name,
            keys=[
                'year_prime', 'school_id_prime', 'grade_prime', 'class_prime'
            ],
            group_name_col='name')[[
                'mst_id', 'name', 'math_level_prime', 'math_level',
                'teacher_id', 'year_prime'
            ]].dropna(subset=['math_level_prime', 'math_level', 'year_prime'])
        # .pipe(lambda dfx: pd.get_dummies(dfx, columns=['mst_id'], sparse=True, prefix='mstid'))
    )
    # NOTE: calling something whose output cannot be printed will break — be careful.
    pandas2ri.activate()
    r_df = ro.r.assign("r_df", pandas2ri.py2rpy(df))
    aa = ro.r(
        "res <- lfe::felm(math_level ~ math_level_prime | as.factor(mst_id) |0 |0, r_df)"
    )
    bb = ro.r("res$residuals")
    sdf = (
        StudentDataFrame.get_student_dataframe(
            data=df,
            covariate_cols=['math_level_prime'],
            outcome_col='math_level',
            class_name_col='name',  # 1 teacher: 1 class
            time_col='year_prime',
            teacher_id_col='teacher_id',
        ))
    sdf.fillna_teacher_id_from_class_cols()
    tvtva = TeacherValueAddedEstimator(effect_type='time_fixed')
    tvtva.fit(sdf=sdf, is_custom_predict=True, custom_resid=bb)
    teacher_effect = tvtva.teacher_effect
    tvtva = TeacherValueAddedEstimator(effect_type='time_varing')
    tvtva.fit(sdf=sdf, is_custom_predict=True, custom_resid=bb)
    teacher_effect2 = tvtva.teacher_effect
    # NOTE(review): the source appears truncated here — the trailing
    # triple-quote opens a block whose closing delimiter is outside this view.
    """
def to_trajr(trj):
    """Convert trajectory to R `trajr` object. Default fps is 30.

    Args:
        trajectory (:class:`~traja.TrajaDataFrame`): trajectory

    Returns:
        traj (:class:`rpy2.robjects.vectors.DataFrame`): column names are
        ['x', 'y', 'time', 'displacementTime', 'polar', 'displacement']

    .. doctest::

        >>> import traja; from traja import rutils
        >>> df = traja.TrajaDataFrame({'x':range(5),'y':range(5)})
        >>> trjr_df = rutils.to_trajr(df)  # doctest: +SKIP
        >>> [x for x in trjr_df.names]  # doctest: +SKIP
        ...
        ['x', 'y', 'id', 'time', 'displacementTime', 'polar', 'displacement']
    """
    from traja.trajectory import _get_time_col

    trajr = import_trajr()
    # Ensure an 'id' column exists; trajr expects one.
    if "id" not in trj.__dict__.keys():
        trj["id"] = 0
    time_col = _get_time_col(trj)
    # When time lives in the index, materialize it as a 'time' column.
    if time_col == "index":
        trj["time"] = trj.index
        time_col = "time"
    fps = trj.fps
    # Fall back to metres/seconds when units are not set on the frame.
    spatial_units = trj.spatial_units or "m"
    time_units = trj.time_units or "s"

    trj_rdf = rpandas.py2rpy(trj)

    trajr_trj = trajr.TrajFromCoords(
        trj_rdf,
        xCol="x",
        yCol="y",
        timeCol=time_col or rpy2.rinterface.NULL,
        fps=fps or 30,
        spatialUnits=spatial_units,
        timeUnits=time_units,
    )

    return trajr_trj
def run(execute, globalenv=None, **kwargs):
    """Run a named analysis script, Python or R, with `kwargs` as inputs.

    Resolution order: '<dir>/<execute>.py' next to this module, then
    `execute` itself as a .py path, then '<dir>/<execute>.r', then
    `execute` as an R file path, else `execute` is treated as R source text.

    Returns:
        For Python scripts: the script's main(**kwargs) result.
        For R: the robjects.r environment after evaluation.
    """
    ## search inside analysis folder
    home = os.path.realpath(__file__)
    ## check if the command is a python file
    f = os.path.dirname(home) + '/' + execute + '.py'
    if os.path.isfile(f):
        execute = f
    if os.path.isfile(execute) and execute.endswith('.py'):
        # Load the file as a throwaway module and delegate to its main().
        module_spec = importlib.util.spec_from_file_location(
            'plugin_module', execute)
        module = importlib.util.module_from_spec(module_spec)
        module_spec.loader.exec_module(module)
        return module.main(**kwargs)
    ## assume script is R
    if globalenv:
        # Replace the R global environment wholesale with the caller's.
        rpy2.robjects.globalenv = globalenv
    for name, value in kwargs.items():
        ## for debug conversion errors
        # print name
        # print type( value )
        if isinstance(value, dict):
            ## use pandas
            value = pandas.DataFrame.from_dict(value)
            rpy2.robjects.globalenv[name] = pandas2ri.py2rpy(value)
        else:
            rpy2.robjects.globalenv[name] = converter.py2rpy(value)
    f = os.path.dirname(home) + '/' + execute + '.r'
    if os.path.isfile(f):
        execute = open(f).read()
    # If `execute` is (now) a readable path, load its text; otherwise it is
    # assumed to already be R source code.
    if os.path.isfile(execute):
        execute = open(execute).read()
    robjects.r(execute)
    return robjects.r  ## return all computed things
def KM(OS, Censored, as_group, data, ggsave=False, path="./", pvalue=0):
    """Kaplan-Meier log-rank test after splitting samples into low/high halves.

    Sorts samples by `as_group`, labels the lower half "L" and the upper half
    "H", then runs R's survdiff on Surv(OS, Censored) ~ group.

    Args:
        OS (str): survival-time column name.
        Censored (str): censoring-indicator column name.
        as_group (str): column whose sorted median splits the cohort.
        data (pd.DataFrame): input table containing the three columns.
        ggsave (bool): when True and the p-value beats `pvalue`, save a plot.
        path (str): output directory for the plot.
        pvalue (float): significance threshold for saving the plot.

    Returns:
        float: the log-rank test p-value.
    """
    # Split into groups
    surv_data = data[[OS, Censored, as_group]].sort_values(by=[as_group])
    surv_data["group"] = "L"
    # Fix: the original used chained indexing (`.iloc[...]["group"] = "H"`),
    # which assigns into a temporary copy and silently left every row "L".
    surv_data.iloc[int(surv_data.shape[0] / 2):,
                   surv_data.columns.get_loc("group")] = "H"
    # Kaplan-Meier survival curve / log-rank test in R
    with localconverter(ro.default_converter + pandas2ri.converter):
        robjects.globalenv["surv_data"] = pandas2ri.py2rpy(surv_data)
        robjects.globalenv["surv_diff"] = r(
            f"survdiff(Surv({OS}, {Censored})~group,surv_data,rho = 0)")
        Pvalue = r("1 - pchisq(surv_diff$chisq, length(surv_diff$n) -1)")[0]
    if ggsave and Pvalue < pvalue:
        # NOTE(review): `gene` is not defined in this function — it presumably
        # comes from the enclosing module's globals; confirm before relying
        # on the ggsave branch.
        r.ggsave(r(
            f"autoplot(survfit(Surv({OS}, {Censored})~group,surv_data), xlab = 'Time', ylab = 'Survival')+ggtitle('Pvalue = {Pvalue}')"
        ),
                 file=f"{path}/{gene}.pdf")
    return Pvalue
def estimation_fixed_effect(outcome_col, time_col, teacher_id_col,
                            class_name_col, covariate_cols, fixed_effect_cols,
                            **argv):
    """Estimate teacher fixed effects with lfe::felm on the toda_teacher CSV.

    Returns:
        pd.DataFrame: one row per teacher with columns
        [teacher_id_col, 'tva'] (the estimated teacher value-added).
    """
    def create_formula(target, covariate_cols, fixed_effect_cols):
        # felm formula layout: outcome ~ covariates | fixed effects | IV | cluster
        templete = '{target} ~ {covariate_str} | {fixed_str} | 0 | 0 '
        covariate_str = get_add_str_from_str_list(
            covariate_cols) if len(covariate_cols) > 0 else ' 0 '
        fixed_str = get_add_str_from_str_list(
            fixed_effect_cols) if len(fixed_effect_cols) > 0 else ' 0 '
        return templete.format(target=target,
                               covariate_str=covariate_str,
                               fixed_str=fixed_str)

    use_cols = [outcome_col, time_col, teacher_id_col, class_name_col
                ] + covariate_cols + fixed_effect_cols
    dropna_subset_cols = [outcome_col, time_col, class_name_col
                          ] + covariate_cols + fixed_effect_cols
    # Teacher id enters the model as one of the fixed effects.
    fixed_effect_cols_plus_tid = [teacher_id_col] + fixed_effect_cols
    formula = create_formula(outcome_col, covariate_cols,
                             fixed_effect_cols_plus_tid)
    # start
    pd.set_option("display.max_columns", 101)
    df_res = (
        pd.read_csv('./notebook/toda_teacher/df.csv')
        # Restrict to elementary schools, 2015 onward.
        .pipe(lambda dfx: dfx.loc[dfx['year_prime'] >= 2015]).pipe(
            lambda dfx: dfx.loc[dfx['school_id_prime'] < 30000])
        [use_cols].dropna(subset=dropna_subset_cols)
        # .pipe(lambda dfx: pd.get_dummies(dfx, columns=['mst_id'], sparse=True, prefix='mstid'))
    )
    pandas2ri.activate()
    _res1 = ro.r.assign("r_df", pandas2ri.py2rpy(df_res))
    _res2 = ro.r("res <- lfe::felm({formula}, r_df)".format(formula=formula))
    bb = ro.r("lfe::getfe(res)")
    pandas2ri.deactivate()
    # getfe rows are named '<fe>.<level>'; keep only the teacher effects and
    # recover the raw teacher id (cast back to the source dtype).
    effect = (bb.reset_index().pipe(lambda dfx: dfx.loc[dfx[
        'fe'] == teacher_id_col, ['index', 'effect']]).assign(
            **{
                teacher_id_col: lambda dfx: dfx['index'].str.extract(
                    '{0}\.(.+)'.format(teacher_id_col)).astype(
                        df_res[teacher_id_col].dtype)
            })[[teacher_id_col, 'effect']].rename(columns={'effect': 'tva'}))
    return effect
def list_to_vector(l):
    """Convert a Python list (or generator/map) into an R vector or data frame.

    Dispatches on the type of the first element: str/bool/int/float map to
    the matching R sexp vector; a list of dicts becomes a data.frame (rows
    are assumed to share the first row's keys; missing keys become None).
    Empty or unrecognized input yields NA.
    """
    if isinstance(l, (types.GeneratorType, map)):
        l = list(l)
    if len(l) == 0:
        return rpy2.rinterface.NA_Real
    head = l[0]
    if isinstance(head, str):
        return rpy2.rinterface.StrSexpVector(l)
    # Fix: bool must be tested BEFORE int — bool is a subclass of int, so the
    # original's int branch shadowed this one and booleans were emitted as an
    # IntSexpVector.
    if isinstance(head, bool):
        return rpy2.rinterface.BoolSexpVector(l)
    if isinstance(head, int):
        return rpy2.rinterface.IntSexpVector(l)
    if isinstance(head, float):
        return rpy2.rinterface.FloatSexpVector(l)
    if isinstance(head, dict):
        ## need to convert to data frame
        ## let's hope the keys are always the same for each of things in the list
        keys = head.keys()
        dataframe = {key: [row.get(key) for row in l] for key in keys}
        dataframe = pandas.DataFrame.from_dict(dataframe)
        return pandas2ri.py2rpy(dataframe)
    ## default to NA just in case
    return rpy2.rinterface.NA_Real
def predict(self,
            x_test: Optional[np.ndarray] = None,
            key_added: str = "_x_test",
            **kwargs) -> np.ndarray:
    """
    Run the prediction.

    Params
    ------
    x_test
        Features used for prediction.
    key_added
        Attribute name where to save the independent variables.
        If `None`, don't save them.
    kwargs
        Keyword arguments.

    Returns
    -------
    :class:`numpy.ndarray`
        The predicted values.
    """
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri

    if self.model is None:
        raise RuntimeError(
            "Trying to call an uninitialized model. To initialize it, run `.fit()` first."
        )

    # Validates/stores x_test on self (under `key_added`).
    self._check(key_added, x_test)

    pandas2ri.activate()
    # Predict through R; squeeze to 1-D and coerce to the model's dtype.
    self._y_test = (np.array(
        robjects.r.predict(
            self.model,
            newdata=pandas2ri.py2rpy(
                pd.DataFrame(self.x_test, columns=["x"])),
        )).squeeze().astype(self._dtype))
    pandas2ri.deactivate()

    return self.y_test
def linear_model(data, Input, Output, Condition):
    """Fit an R linear model `Output ~ Input * Condition` on `data`.

    Args:
        data (pd.DataFrame): table containing the three named columns.
        Input, Output, Condition (str): column names used in the formula.

    Returns:
        pd.DataFrame: the lm coefficient table (rows = terms, columns =
        estimate/std error/t/p), or an empty DataFrame on any failure
        (best-effort behavior preserved from the original).
    """
    try:
        stats = importr('stats')
        base = importr('base')
        pandas2ri.activate()
        r_df = pandas2ri.py2rpy(data)
        pandas2ri.deactivate()
        formula = '{y}~{x}*{condition}'.format(y=Output,
                                               x=Input,
                                               condition=Condition)
        lm = stats.lm(formula, r_df)
        summary = (base.summary(lm))
        results = summary.rx2('coefficients')
        results_df = base.as_data_frame_matrix(results)
        py_results_df = pd.DataFrame(results_df).transpose()
        py_results_df.columns = results_df.colnames
        py_results_df.index = results_df.rownames
        return (py_results_df)
    except Exception:
        # Fix: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; keep the empty-frame fallback for ordinary errors.
        return (pd.DataFrame({}))
def dml_iivm_pyvsr_fixture(generate_data_iivm, idx, score, dml_procedure):
    """Fixture: fit DoubleML IIVM in Python and in R on identical data and
    sample splits, returning coefficients and standard errors from both."""
    boot_methods = ['normal']
    n_folds = 2

    # collect data
    data = generate_data_iivm[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & gg
    learner_classif = LogisticRegression(penalty='none', solver='newton-cg')
    learner_reg = LinearRegression()
    ml_g = clone(learner_reg)
    ml_m = clone(learner_classif)
    ml_r = clone(learner_classif)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols, 'z')
    dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure)
    # Fixed seed so the Python fit is reproducible against the R run.
    np.random.seed(3141)
    dml_iivm_obj.fit()

    # fit the DML model in R, reusing the exact Python sample splits
    all_train, all_test = export_smpl_split_to_r(dml_iivm_obj.smpls[0])
    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_IIVM(r_dataframe, score, dml_procedure, all_train, all_test)

    res_dict = {
        'coef_py': dml_iivm_obj.coef,
        'coef_r': res_r[0],
        'se_py': dml_iivm_obj.se,
        'se_r': res_r[1]
    }

    return res_dict
def dml_irm_pyvsr_fixture(generate_data_irm, idx, score, dml_procedure): n_folds = 2 # collect data (X, y, d) = generate_data_irm[idx] x_cols = [f'X{i + 1}' for i in np.arange(X.shape[1])] data = pd.DataFrame(np.column_stack((X, y, d)), columns=x_cols + ['y', 'd']) # Set machine learning methods for m & g learner_classif = LogisticRegression(penalty='none', solver='newton-cg') learner_reg = LinearRegression() ml_g = clone(learner_reg) ml_m = clone(learner_classif) obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols) dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, ml_g, ml_m, n_folds, score=score, dml_procedure=dml_procedure) np.random.seed(3141) dml_irm_obj.fit() # fit the DML model in R all_train, all_test = export_smpl_split_to_r(dml_irm_obj.smpls[0]) r_dataframe = pandas2ri.py2rpy(data) res_r = r_IRM(r_dataframe, score, dml_procedure, all_train, all_test) res_dict = { 'coef_py': dml_irm_obj.coef, 'coef_r': res_r[0], 'se_py': dml_irm_obj.se, 'se_r': res_r[1] } return res_dict
def gsea_metrics(source, target, file, meta_file):
    """
    Calculates GSEA score for validation on CFM

    Args:
        source (str): Source cell type
        target (str): Target cell type
        file (str): Path to the file with TopoCMap results table
        meta_file (str): Path to the file with drugs metadata

    Returns:
        float :Normalized enrichment score
    """
    drug_meta = pd.read_csv(meta_file)
    df = pd.read_csv(file)
    df = df.drop_duplicates()
    # (fix: removed leftover debug print(type(df)) / print(df_result_r))
    cids_cur = stand_chems(source, target)
    # Map pubchem cids to perturbation ids; entries that aren't numeric are
    # skipped via the ValueError guard.
    pert_cur = []
    for ind, chem in enumerate(drug_meta['pubchem_cid']):
        for chem_1 in cids_cur:
            try:
                if int(chem) == int(chem_1):
                    pert_cur.append(drug_meta['pert_id'].loc[ind])
            except ValueError:
                continue
    # Defining the R script and loading the instance in Python
    r = robjects.r
    r['source']('~/Downloads/fgsea-tutorial.R')
    # Loading the function we have defined in R.
    filter_country_function_r = robjects.globalenv['fgsea_analysis']
    # converting it into r object for passing into r function
    df_r = pandas2ri.py2rpy(df)
    pert_cur_r = robjects.vectors.FactorVector(pert_cur)
    # Invoking the R function and getting the result
    df_result_r = filter_country_function_r(df_r, pert_cur_r)
    # Converting it back to a pandas dataframe.
    return df_result_r["NES"]
def auto_arima(endog, exog=None, freq=None):
    """Fit R forecast::auto.arima and return the selected orders.

    Args:
        endog (pd.Series): endogenous series.
        exog (pd.DataFrame | None): optional exogenous regressors.
        freq (int | None): seasonal frequency; defaults to 1.

    Returns:
        ((p, d, q), (P, D, Q, s)): non-seasonal and seasonal ARIMA orders.
    """
    if freq is None:
        freq = 1
    # endog_r = r.ts(pandas2ri.py2ri(endog), freq=freq)
    # if using more recent version of rpy2, py2ri was renamed to py2rpy
    # see reference: https://stackoverflow.com/questions/55990529/module-rpy2-robjects-pandas2ri-has-no-attribute-ri2py
    endog_r = r.ts(pandas2ri.py2rpy(endog), freq=freq)
    # Wide search bounds handed straight to auto.arima.
    autoarima_args = {
        "seasonal": True,
        "stationary": False,
        "trace": True,
        "max.order": 20,
        "max.p": 20,
        "max.q": 20,
        "max.P": 20,
        "max.Q": 20,
        "max.D": 20,
        "max.d": 20,
        "start.p": 1,
        "start.q": 1,
        "start.P": 1,
        "start.Q": 1
    }
    if exog is not None:
        # add noise to avoid rank-deficient error for exog
        scale = np.std(exog.values)
        z = scale * 1e-4 * np.random.randn(*exog.shape)
        exog_r = r.matrix(exog.values + z,
                          nrow=exog.shape[0],
                          ncol=exog.shape[1],
                          dimnames=[[], exog.columns.tolist()])
        fit_r = forecast.auto_arima(y=endog_r, xreg=exog_r, **autoarima_args)
    else:
        fit_r = forecast.auto_arima(y=endog_r, **autoarima_args)
    fit_dict = dict(fit_r.items())
    # for proof of this order see last comment:
    # https://stats.stackexchange.com/questions/178577/how-to-read-p-d-and-q-of-auto-arima
    p, q, P, Q, s, d, D = list(fit_dict["arma"])
    return (p, d, q), (P, D, Q, s)
def predict(self, x_test: Optional[np.ndarray] = None, key_added: str = "_x_test", **kwargs) -> np.ndarray: """ %(base_model_predict.full_desc)s Parameters ---------- %(base_model_predict.parameters)s Returns ------- %(base_model_predict.returns)s """ # noqa from rpy2 import robjects from rpy2.robjects import pandas2ri if self.model is None: raise RuntimeError( "Trying to call an uninitialized model. To initialize it, run `.fit()` first." ) if self._lib is None: raise RuntimeError( f"Unable to fit the model, R package `{self._lib_name!r}` is not imported." ) x_test = self._check(key_added, x_test) pandas2ri.activate() self._y_test = (np.array( robjects.r.predict( self.model, newdata=pandas2ri.py2rpy(pd.DataFrame(x_test, columns=["x"])), )).squeeze().astype(self._dtype)) pandas2ri.deactivate() return self.y_test
def dml_plr_pyvsr_fixture(generate_data1, idx, score, dml_procedure):
    """Fixture: fit DoubleML PLR in Python and in R on identical data and
    sample splits, returning coefficients and standard errors from both."""
    n_folds = 2
    n_rep_boot = 483

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    learner = LinearRegression()
    ml_g = clone(learner)
    ml_m = clone(learner)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)
    # NOTE(review): the seed is commented out here but active in the sibling
    # IIVM/IRM fixtures — confirm whether this run is meant to be unseeded.
    #np.random.seed(3141)
    dml_plr_obj.fit()

    # fit the DML model in R, reusing the exact Python sample splits
    all_train, all_test = export_smpl_split_to_r(dml_plr_obj.smpls[0])
    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_MLPLR(r_dataframe, score, dml_procedure, all_train, all_test)

    res_dict = {
        'coef_py': dml_plr_obj.coef,
        'coef_r': res_r[0],
        'se_py': dml_plr_obj.se,
        'se_r': res_r[1]
    }

    return res_dict
def fit(
    self,
    x: Optional[np.ndarray] = None,
    y: Optional[np.ndarray] = None,
    w: Optional[np.ndarray] = None,
    **kwargs,
) -> "GamMGCVModel":
    """Fit an mgcv GAM (cubic regression spline) to positively-weighted points.

    Params
    ------
    x
        Independent variables.
    y
        Dependent variables.
    w
        Weights of x.
    kwargs
        Keyword arguments; `n_splines` overrides the instance default.

    Returns
    -------
    GamMGCVModel
        Fitted self.
    """
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri, Formula
    from rpy2.robjects.packages import importr

    super().fit(x, y, w, **kwargs)

    # Only points with strictly positive weight take part in the fit.
    use_ixs = np.where(self.w > 0)[0]
    self._x = self.x[use_ixs]
    self._y = self.y[use_ixs]
    self._w = self.w[use_ixs]

    n_splines = kwargs.pop("n_splines", self._n_splines)
    mgcv = importr("mgcv")
    pandas2ri.activate()
    # Two-column frame (x, y) restricted to the used points, handed to R.
    df = pandas2ri.py2rpy(
        pd.DataFrame(np.c_[self.x, self.y][use_ixs, :], columns=["x", "y"]))
    # Cubic-regression-spline GAM with k = n_splines basis functions.
    self._model = mgcv.gam(
        Formula(f'y ~ s(x, k={n_splines}, bs="cr")'),
        data=df,
        sp=self._sp,
        family=robjects.r.gaussian,
        weights=pd.Series(self.w[use_ixs]),
    )
    pandas2ri.deactivate()

    return self
def predict(self,
            x_test: Optional[np.ndarray] = None,
            key_added: str = "_x_test",
            **kwargs) -> np.ndarray:
    """Run the prediction with the fitted R model.

    Params
    ------
    x_test
        Features used for prediction; validated/stored via `self._check`.
    key_added
        Attribute name under which the independent variables are saved.
    kwargs
        Keyword arguments (unused here).

    Returns
    -------
    numpy.ndarray
        The predicted values.

    Raises
    ------
    RuntimeError
        If `.fit()` has not been called yet.
    """
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri

    if self.model is None:
        # Fix: was an f-string with no placeholders (F541).
        raise RuntimeError(
            "Trying to call an uninitialized model. To initialize it, run `.fit()` first."
        )

    self._check(key_added, x_test)

    pandas2ri.activate()
    # Predict through R; squeeze to 1-D and coerce to the model's dtype.
    self._y_test = (np.array(
        robjects.r.predict(
            self.model,
            newdata=pandas2ri.py2rpy(
                pd.DataFrame(self.x_test, columns=["x"])),
        )).squeeze().astype(self._dtype))
    pandas2ri.deactivate()

    return self.y_test