def loadDataSet(self, rDataFilename):
    pandas2ri.activate()
    base = importr('base')
    base.load(rDataFilename)
    self.trainSet = pandas2ri.ri2py_dataframe(base.mget('train')[0])
    self.testSet = pandas2ri.ri2py_dataframe(base.mget('test')[0])
    self.dataColumns = [x for x in list(self.trainSet.columns) if x[0] == 'i']
    self.classColumns = [x for x in list(self.trainSet.columns) if x[0] == 'c']
def get_protein_traces_by_id(protein_ids, id_type):
    result = cached_run_secexploerer(protein_ids, id_type)
    if result is None or result[1] == NULL or result[1][1] == NULL:
        return pd.DataFrame(), [], [0, 0], {}, {}
    traces = pandas2ri.ri2py_dataframe(result[1][0][0])
    traces = traces.set_index(["id"])
    traces.index.name = "protein_id"
    mapping_table = pandas2ri.ri2py_dataframe(result[0][3])
    if len(mapping_table.columns) == 3:
        mapping = dict(zip(mapping_table.iloc[:, 0], mapping_table.iloc[:, 2]))
    else:
        mapping = {}
    labels = []
    for uniprot_id in traces.index:
        extra_label = mapping.get(uniprot_id)
        if extra_label is not None:
            label = "%s (%s)" % (extra_label, uniprot_id)
        else:
            label = uniprot_id
        labels.append(label)
    features = pandas2ri.ri2py_dataframe(result[1][1])
    monomer_secs = {}
    monomer_intensities = {}
    for subunits, monomer_sec in zip(features.subunits_detected,
                                     features.monomer_sec):
        subunits = subunits.split(";")
        monomer_sec = monomer_sec.split(";")
        for (su, sec) in zip(subunits, monomer_sec):
            monomer_secs[su] = sec
            intensity = traces.loc[su, sec]
            monomer_intensities[su] = intensity
    new_subunits = []
    for subunits in features.subunits_detected:
        subunits = subunits.split(";")
        subunits = [mapping.get(su, su) for su in subunits]
        new_subunits.append(";".join(subunits))
    features["subunits_detected"] = new_subunits
    calibration_parameters = result[1][2]
    return traces, labels, calibration_parameters, monomer_secs, monomer_intensities
def getCorrelations(self, dataframe):
    """
    Compute a pair-wise Pearson correlation matrix across samples
    from a dataframe of expression values

    Arguments
    ---------
    dataframe: pandas.Core.DataFrame
        a dataframe containing gene IDs, sample IDs and gene
        expression values

    Returns
    -------
    corr_frame: pandas.Core.DataFrame
        a dataframe of a pair-wise correlation matrix across samples.
        Uses the Pearson correlation.
    """

    # set sample_id to index
    pivot = dataframe.pivot(index="sample_name",
                            columns="transcript_id",
                            values="TPM")
    transpose = pivot.T

    # why do I have to resort to R????
    r_df = py2ri.py2ri_pandasdataframe(transpose)
    R.assign("p.df", r_df)
    R("""p.mat <- apply(p.df, 2, as.numeric)""")
    R("""cor.df <- cor(p.mat)""")
    r_cor = R["cor.df"]
    py_cor = py2ri.ri2py_dataframe(r_cor)
    corr_frame = py_cor

    return corr_frame
def apply_transferFunction_metric(r_stream1, r_stream2, evalresp1, evalresp2):
    """
    Invoke the "transferFunction" R metric and convert the R dataframe result
    into a Pandas dataframe.
    :param r_stream1: an r_stream object
    :param r_stream2: an r_stream object
    :param evalresp1: pandas DataFrame of evalresp FAP for r_stream1
    :param evalresp2: pandas DataFrame of evalresp FAP for r_stream2
    :return:
    """
    R_function = robjects.r('IRISMustangMetrics::transferFunctionMetric')

    # NOTE: Conversion of dataframes only works if you activate, but we don't want
    # NOTE: conversion to always be automatic so we deactivate() after we're done converting.
    pandas2ri.activate()
    r_evalresp1 = pandas2ri.py2ri_pandasdataframe(evalresp1)
    r_evalresp2 = pandas2ri.py2ri_pandasdataframe(evalresp2)
    pandas2ri.deactivate()

    # TODO: Can we just activate/deactivate before/after R_function() without converting
    # TODO: r_evalresp1/2 ahead of time?

    # Calculate the metric
    r_metriclist = R_function(r_stream1, r_stream2, r_evalresp1, r_evalresp2)
    r_dataframe = _R_metricList2DF(r_metriclist)
    pandas2ri.activate()
    df = pandas2ri.ri2py_dataframe(r_dataframe)
    pandas2ri.deactivate()

    # Convert columns from R POSIXct to Python UTCDateTime
    df.starttime = df.starttime.apply(UTCDateTime)
    df.endtime = df.endtime.apply(UTCDateTime)
    return df
def main(args):
    importr('HMMcopy')
    titan = importr('TitanCNA')
    if args.target_bed_file is None:
        df = titan.correctReadDepth(
            args.tumour_wig_file,
            args.normal_wig_file,
            args.gc_wig_file,
            args.mappability_wig_file,
        )
    else:
        target_df = pd.read_csv(args.target_bed_file, header=None, sep='\t')
        df = titan.correctReadDepth(
            args.tumour_wig_file,
            args.normal_wig_file,
            args.gc_wig_file,
            args.mappability_wig_file,
            targetedSequence=pandas2ri.py2ri(target_df))
    df = pandas2ri.ri2py_dataframe(df)
    df.to_csv(args.out_file, index=False, sep='\t')
def extract_scholar_publications(persons):
    "Extract and return publication and citation information."

    # Import the scholar package
    scholar = importr("scholar")

    # Extract scholar publication information for each person, store
    # as standard Python dictionary
    publications = {}
    for (name, id) in persons.items():
        print("Extracting publication information for %s" % name)

        # Get basic profile info
        pubs = scholar.get_publications(id)

        # Convert to pandas dataframe
        try:
            df = pandas2ri.ri2py_dataframe(pubs)
            publications[id] = df
            print("Success")
        except:
            print("Extraction failed for %s. Ignoring data." % name)

    return publications
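# Hedged usage sketch for extract_scholar_publications (not part of the original
# source); the name and Google Scholar ID below are hypothetical placeholders,
# and the R `scholar` package must be installed for the call to succeed.
persons = {"Jane Doe": "AbCdEfGhIjK"}
publications = extract_scholar_publications(persons)
if "AbCdEfGhIjK" in publications:
    print(publications["AbCdEfGhIjK"].head())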
def loadAffyCelsNorm(F, d):
    F1 = robjects.vectors.StrVector(F)
    # F1 = 'c(' + ','.join(["'%s'" % i for i in F]) + ')'
    E1 = R_loadAffyCelsFiles(F1, d)
    X1 = pandas2ri.ri2py_dataframe(E1)
    X1['cel'] = F
    return X1
def get_ds_w(path_w):
    robjects.r['load'](path_w)
    rdf = robjects.r['teste']
    dfs = []
    for i in range(1, len(rdf)):
        pd_df = pandas2ri.ri2py_dataframe(rdf[i])
        idx = pd.DatetimeIndex(pd_df.iloc[:, 0])
        idx = idx.tz_localize(None)
        pd_df.index = idx
        pd_df.index.name = 'time'
        pd_df = pd_df.iloc[:, 1:]
        pd_df = pd_df.loc[~pd_df.index.duplicated(keep='first')]
        dfs.append(pd_df)
    ds_w = xr.concat([df.to_xarray() for df in dfs], dim='sensor')
    ds_w.time.values = pd.DatetimeIndex(ds_w.time.values)
    sensor_names = list(rdf.names[1:])
    ds_w = ds_w.assign_coords(sensor=sensor_names)
    ds_w = ds_w.to_array(dim='depth')
    return ds_w
def zscrp(csvs):
    for a in csvs:
        a0 = pd.read_csv(a)
        robjects.r('''
            calc.zpos <- function(x) {
                ctrl.avgs <- x %>%
                    group_by(rep, plt_nm) %>%
                    filter(condt == "+ctrl") %>%
                    dplyr::summarise(ctrlmean = mean(area), ctrlstdev = sd(area))
                x.zpos <- left_join(x, ctrl.avgs, by = c("rep", "plt_nm")) %>%
                    mutate(zpos = (area - ctrlmean)/ctrlstdev) %>%
                    select(-c(ctrlmean, ctrlstdev))
                return(x.zpos)
            }
        ''')
        r_f = robjects.globalenv['calc.zpos']
        res = r_f(a0)
        r.data('res')
        pd_df = pandas2ri.ri2py_dataframe(res)
        # out = a.split('.csv')
        pd_df.to_csv(a, index=False)
def prep_exp(include_lab, include_ethdon, lag, eq_train_ratio, num_folds,
             train_thresh_year, cutoff, file_dir):
    import rpy2.robjects as robjects
    from rpy2.robjects.packages import importr
    from rpy2.robjects.lib.dplyr import DataFrame
    from rpy2.robjects.packages import STAP
    from rpy2.robjects import pandas2ri

    if eq_train_ratio:
        eq_cases_train_cols = np.array(['TRR_ID', 'is_diab'])
    else:
        eq_cases_train_cols = None

    # Read RDS files (load data table)
    read_rds = robjects.r['readRDS']
    tx_li_study = read_rds(os.path.join(file_dir, 'tx_li_formatted.rds'))
    txf_li_study = read_rds(os.path.join(file_dir, 'txf_li_formatted.rds'))

    # Merge them
    cols, cov_cols, timedep_cols = get_cols(include_lab, include_ethdon, lag, file_dir)
    with open(os.path.join(file_dir, 'R', 'functions.R'), 'r') as f:
        string = f.read()
    functions = STAP(string, 'functions')
    merged = functions.combine_tx_txf(tx_li_study, txf_li_study,
                                      np.setdiff1d(cov_cols, 'age'),
                                      timedep_cols, lag)
    df = pandas2ri.ri2py_dataframe(
        DataFrame(merged).filter('time_next_followup > time_since_transplant'))

    # Prep data for model training - only take complete ones
    subset_cols = np.concatenate((['TRR_ID', 'age', 'transplant_year'], cols, [
        'is_diab', 'time_since_transplant', 'time_next_followup',
        'time_to_diab', 'diab_time_since_tx', 'diab_in_1_year', 'diab_now'
    ]))
    df = df.dropna(subset=subset_cols)
    df_test = df[(df.transplant_year.astype(int) >= 2011)
                 & (df.time_to_diab >= 0)]
    df_nontest = df[(df.transplant_year.astype(int) < 2011)
                    & (df.transplant_year.astype(int) >= train_thresh_year)
                    & (df.time_to_diab >= 0)]
    if cutoff:
        df_nontest = df_nontest[df_nontest.transplant_year.astype(int) +
                                df_nontest.time_since_transplant < 2011]
    if num_folds > 0:
        nontest_y = df_nontest.drop_duplicates(
            subset=['TRR_ID', 'is_diab']).is_diab
        caret = importr('caret')
        folds = caret.createFolds(nontest_y.values, num_folds, False)
    else:
        folds = None
    return {
        'test': df_test,
        'train': df_nontest,
        'cols': cols,
        'eq_cases_train_cols': eq_cases_train_cols,
        'folds': folds
    }
def _convert_to_python(x):
    if isinstance(x, DataFrame):
        return pandas2ri.ri2py_dataframe(x)
    elif isinstance(x, ListVector) or isinstance(x, Vector):
        return [_convert_to_python(item) for item in x]
    else:
        return np.array(x)
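# Hedged usage sketch for _convert_to_python (not from the original source);
# assumes rpy2 2.x, where DataFrame/ListVector/Vector come from rpy2.robjects
# and pandas2ri.ri2py_dataframe is still available.
from rpy2 import robjects

# An R list holding a data.frame and a numeric vector
r_obj = robjects.r('list(df = data.frame(a = 1:3), v = c(1.5, 2.5))')
converted = _convert_to_python(r_obj)
# converted[0] is a pandas DataFrame; converted[1] is a list built
# from the elements of the numeric vector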
def test1(self):
    rkt = rpackages.importr('rkt')
    nyear = 4
    nseas = 5
    year = np.repeat(np.arange(2000, 2000 + nyear), nseas)
    dekad = np.tile(1 + np.arange(nseas), nyear)
    data = np.random.rand(nseas * nyear) + np.arange(nseas * nyear) * 0.1
    if 1:
        year = robjects.IntVector(year)
        dekad = robjects.IntVector(dekad)
        data = robjects.FloatVector(data)
    else:
        year = rpyn.numpy2ri(year)
        dekad = rpyn.numpy2ri(dekad)
        data = rpyn.numpy2ri(data)
    print(year)
    print(dekad)
    print(data)
    self.res = rkt.rkt(year, data, dekad)
    print(self.res)
    df = pandas2ri.ri2py_dataframe(self.res).transpose()
    df.columns = self.res.names
    df = df[['sl', 'S', 'B', 'varS', 'tau']]
    print(pd.concat([df, df, df]))
    self.df = df
def run_deseq2(self, exp_lib_list, ctr_lib_list, size_factors,
               pairwise_replicates):
    self._count_df = np.round(self._count_df, decimals=0)
    self._count_df = self._count_df.astype(int)
    conds = ["exp"] * len(exp_lib_list) + ["ctr"] * len(ctr_lib_list)
    if pairwise_replicates:
        samples = list(range(1, len(exp_lib_list) + 1)) + list(
            range(1, len(ctr_lib_list) + 1))
        colData = robjects.DataFrame({
            "conditions": robjects.StrVector(conds),
            "samples": robjects.StrVector(samples)})
        design = Formula('~ samples + conditions')
    else:
        colData = robjects.DataFrame(
            {"conditions": robjects.StrVector(conds)})
        design = Formula('~ conditions')
    r_count_df = robjects.DataFrame(self._count_df)
    r_count_df.colnames = robjects.rinterface.NULL
    dds = r.DESeqDataSetFromMatrix(countData=r_count_df, colData=colData,
                                   design=design)
    if size_factors is None:
        dds = r.estimateSizeFactors(dds)
    else:
        assign_sf = r["sizeFactors<-"]
        dds = assign_sf(object=dds,
                        value=robjects.FloatVector(size_factors))
    dds = r.estimateDispersions(dds, quiet=True)
    dds = r.nbinomWaldTest(dds, quiet=True)
    size_factors = pd.Series(r.sizeFactors(dds),
                             index=self._count_df.columns)
    results = r.results(dds, contrast=robjects.StrVector(
        ("conditions", "exp", "ctr")), altHypothesis="greater")
    results_df = pandas2ri.ri2py_dataframe(r['as.data.frame'](results))
    results_df.index = self._count_df.index
    return results_df, size_factors
def deaScranDESeq2(counts, conds, comparisons, alpha, scran_clusters=False):
    """Makes a call to DESeq2 with SCRAN to perform D.E.A. in the given
    counts matrix with the given conditions and comparisons.
    Returns a list of DESeq2 results for each comparison
    """
    results = list()
    n_cells = len(counts.columns)
    try:
        pandas2ri.activate()
        deseq2 = RimportLibrary("DESeq2")
        scran = RimportLibrary("scran")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
        as_matrix = r["as.matrix"]
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.StrVector(conds)
        r_call = """
            function(r_counts) {
                sce = SingleCellExperiment(assays=list(counts=r_counts))
                return(sce)
            }
        """
        r_func = r(r_call)
        sce = r_func(as_matrix(r_counts))
        if scran_clusters:
            r_clusters = scran.quickCluster(as_matrix(r_counts), max(n_cells/10, 10))
            min_cluster_size = min(Counter(r_clusters).values())
            sizes = list(set([round((min_cluster_size/2) / i) for i in [5, 4, 3, 2, 1]]))
            sce = scran.computeSumFactors(sce, clusters=r_clusters,
                                          sizes=sizes, positive=True)
        else:
            sizes = list(set([round((n_cells/2) * i) for i in [0.1, 0.2, 0.3, 0.4, 0.5]]))
            sce = scran.computeSumFactors(sce, sizes=sizes, positive=True)
        sce = r.normalize(sce)
        dds = r.convertTo(sce, type="DESeq2")
        r_call = """
            function(dds, conditions){
                colData(dds)$conditions = as.factor(conditions)
                design(dds) = formula(~ conditions)
                return(dds)
            }
        """
        r_func = r(r_call)
        dds = r_func(dds, cond)
        dds = r.DESeq(dds)
        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B), alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
        pandas2ri.deactivate()
    except Exception as e:
        raise e
    return results
def getResult(request):
    # Collect all of the data to be scored and convert it into a format R can read
    ListingId = request.POST.getlist('ListingId', [])
    Title = request.POST.getlist('Title', [])
    inputAmount = request.POST['inputAmount']
    Months = request.POST.getlist('Months', [])
    CreditCode = request.POST.getlist('CreditCode', [])
    Rate = request.POST.getlist('Rate', [])
    data = rlc.OrdDict([('ListingId', rob.StrVector(ListingId)),
                        ('Title', rob.StrVector(Title)),
                        ('inputAmount', rob.IntVector([inputAmount] * len(ListingId))),
                        ('Months', rob.IntVector(Months)),
                        ('CreditCode', rob.StrVector(CreditCode)),
                        ('Rate', rob.FloatVector(Rate))])
    inputCalDataFrame = rob.DataFrame(data)

    # Source and run the R decision script
    rFilePath = os.path.dirname(os.path.abspath(__file__)) + '/DECISION.R'
    rob.r.source(rFilePath)
    decision = rob.globalenv['DECISION'](inputCalDataFrame)
    decisionDataFrame = pandas2ri.ri2py_dataframe(decision)  # convert to a Python DataFrame

    # Convert into the output result
    inputAmount = list(decisionDataFrame['inputAmount'])[0]
    resultList = []
    for index, row in decisionDataFrame.iterrows():
        resultList.append(row.to_dict())
    return render(request, 'result.html', locals())
def get_enrichment_GO(input_vector, pcutoff=0.05, adjustmethod="BH",
                      qcutoff=0.2, ont="BP", input="SYMBOL", readable=True):
    """
    Gene Ontology Enrichment Analysis (clusterProfiler Bioconductor)

    Args:
        input_vector: (obj:list) gene IDs in str format
        pcutoff: p-value threshold
        adjustmethod: multiple testing correction method, one of
            "holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
            "fdr", "none"
        input: 'SYMBOL' or 'ENTREZ'
        ont: Gene Ontology category
            "BP": biological process
            "CC": cellular component
            "MF": molecular function

    Returns:
        df: DataFrame with enrichment results
    """
    enrich = robjects.r['enrichment_test_GO']
    genes = robjects.StrVector(input_vector)
    enrichment = pandas2ri.ri2py_dataframe(
        enrich(genes, adjustmethod, pcutoff, qcutoff,
               ont=ont, input=input, readable=readable))
    return enrichment
def _edger_func_exacttest(the_data, the_groups, fdr=0.01, lfc=1, pair=None,
                          return_full=False):
    """
    Run edgeR DE analysis without fitting a GLM. Instead, we just compare two groups.
    Only a single factor is supported.
    :param the_data:
    :param the_groups:
    :param fdr:
    :param lfc:
    :param pair: An iterable of two group names. If None, compare the first two groups.
    :return:
    """
    if pair is None:
        lvl, fct = pd.factorize(the_groups)
        pair = fct[:2]
    rpair = robjects.StrVector(pair)
    rdata = pandas2ri.py2ri(the_data)
    rgroups = robjects.FactorVector(the_groups)

    y = r("DGEList")(rdata, group=rgroups)
    y = r("calcNormFactors")(y)
    y = r("estimateDisp")(y)
    et = r('exactTest')(y, rpair)
    if return_full:
        toptags = r('topTags')(et, n=r('Inf'), **{'p.value': 1.})
    else:
        toptags = r('topTags')(et, n=r('Inf'), **{'p.value': fdr})
    if len(toptags) == 0:
        return pd.DataFrame(columns=toptags_cols)
    else:
        tt = pandas2ri.ri2py_dataframe(toptags[toptags.names.index('table')])
        if lfc is not None:
            tt = tt.loc[tt.loc[:, 'logFC'].abs() >= lfc]
        return tt
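# Hedged usage sketch for _edger_func_exacttest (not from the original source);
# assumes edgeR is installed in the R library, `r` is rpy2's robjects.r, and the
# rpy2 2.x pandas conversions used above are active. The counts are toy values.
import pandas as pd

counts = pd.DataFrame({"ctrl_1": [10, 200, 35], "ctrl_2": [12, 180, 40],
                       "treat_1": [50, 210, 5], "treat_2": [45, 190, 8]},
                      index=["gene1", "gene2", "gene3"])
groups = pd.Series(["ctrl", "ctrl", "treat", "treat"], index=counts.columns)
# With pair=None the first two factor levels ("ctrl", "treat") are compared
de_table = _edger_func_exacttest(counts, groups, fdr=0.05, lfc=1)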
def get_enrichment_Reactome(input_vector, pcutoff=0.05, adjustmethod="BH",
                            qcutoff=0.2, min_gs_size=10, max_gs_size=500,
                            organism="human"):
    """
    Reactome Enrichment Analysis (ReactomePA Bioconductor)

    Args:
        input_vector: (obj:list) gene Entrez IDs in str format
        pcutoff: p-value threshold
        adjustmethod: multiple testing correction method, one of
            "holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
            "fdr", "none"

    Returns:
        df: DataFrame with enrichment results
    """
    enrich = robjects.r['enrichment_test_Reactome']
    genes = robjects.StrVector(input_vector)
    enrichment = pandas2ri.ri2py_dataframe(
        enrich(genes, pcutoff, adjustmethod, qcutoff,
               min_gs_size=min_gs_size, max_gs_size=max_gs_size))
    return enrichment
def ddx(self, contrasts=None, formula=None):
    if contrasts is None:
        contrasts = self.contrasts
    if formula is None:
        formula = "~" + "+".join(self.contrasts)
    df = self.data["Transcriptome Profiling"]['counts'].astype(int)
    design = self.metadata[contrasts].reindex(df.columns).reset_index()
    formula = Formula(formula)
    DEG = pandas2ri.ri2py_dataframe(
        DE_Genes(counts_df=pandas2ri.py2ri(df),
                 design_matrix=pandas2ri.py2ri(design),
                 design_formula=formula)).set_index("gene")
    # # Characteristic Direction (multivariate statistical method)
    # # 0 excluded, 1 is control, 2 is perturbation
    # classes = self.metadata[contrasts]
    # # Calculate differential expression / methylation
    # sig_features = geode.chdir(data=self.dataframe.values,
    #                            sampleclass=classes,
    #                            genes=self.dataframe.index,
    #                            gamma=1.,  # smooths covariance and reduces noise
    #                            sort=True,
    #                            calculate_sig=True,
    #                            nnull=100,
    #                            sig_only=True,
    #                            norm_vector=False)
    return DEG  # , pd.DataFrame(sig_features)
def mic(self):
    """Runs MIC analysis.

    Runs MIC analysis using the provided config file, or if no config file
    is provided, using fst.yaml in the current working directory.

    Will prompt if output directory already exists (use/rename/cancel)
    unless called with the -noprompt flag, in which case it will reuse the
    output directory.

    Will prompt if output file already exists (overwrite/rename/cancel)
    unless called with the -noprompt flag, in which case it will overwrite
    the output file.
    """
    # create output dir
    outdir = os.path.join(self._config['results_directory'], 'output_mic')
    self.__careful_mkdir(outdir)

    # prepare data
    filtered = _filter_by_variance(self.df_x)

    # call R
    # TODO just a sec and we'll probably switch this to minepy
    minerva = importr('minerva')
    # pylint: disable=no-member
    mine_out = minerva.mine(filtered.values)
    # pylint: enable=no-member
    mic_out = pandas2ri.ri2py_dataframe(mine_out.rx2(1))

    # restore names
    names_list = list(filtered.columns.values)
    mic_out.rename(columns=lambda x: names_list[int(x)],
                   index=lambda x: names_list[int(x)],
                   inplace=True)

    # save and return
    mic_out.to_csv(os.path.join(outdir, 'MIC.csv'), index_label='feature')
def run(self):
    self.LOG.info("Starting to simulate data...")
    t1 = time()
    simulate = self.get_r_method(c.SIMULATE_R_FILE, 'simulate')
    sim_data = simulate(self.model, self.nrows)
    pandas_df = pandas2ri.ri2py_dataframe(sim_data)
    self.LOG.info("Data simulation complete in %d sec." % (time() - t1))
    return pandas_df.astype(int, copy=False)
def get_tf_factor(var, from_to, value_col="IMPUTED"):
    r_var = r['as.character'](robjects.FactorVector(var))
    r_from_to = robjects.IntVector(from_to)
    data = r['tf_factor_tbl'](r['as.character'](r_var), r_from_to, value_col)
    data = pandas2ri.ri2py_dataframe(data)
    print(var[0])
    gc.collect()
    return data
def fetch_data(app_struct, input_date, offset):
    # Run the R engine
    rcode = generate_rcode(app_struct.token, input_date, input_date,
                           offset, app_struct.sandbox)
    r(rcode)

    # Get the result
    table = robjects.r['table']
    return pandas2ri.ri2py_dataframe(table)
def _run_gsea(df, genesets, method='ssgsea', verbose=False, **kwargs):
    rdata = r('as.matrix')(df)
    rgenesets = robjects.ListVector(genesets)
    res = r('gsva')(rdata, rgenesets, method=method, verbose=verbose, **kwargs)
    py_res = pandas2ri.ri2py_dataframe(res)
    py_res.index = r('rownames')(res)
    # py_res.columns = r('colnames')(res)
    py_res.columns = df.columns
    return py_res
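# Hedged usage sketch for _run_gsea (not from the original source); assumes the
# GSVA R package is loaded in the embedded R session and that pandas2ri
# conversion is active so the DataFrame can pass through r('as.matrix').
import numpy as np
import pandas as pd

expr = pd.DataFrame(np.random.rand(4, 3),
                    index=["gene1", "gene2", "gene3", "gene4"],
                    columns=["s1", "s2", "s3"])
genesets = {"set1": robjects.StrVector(["gene1", "gene2"]),
            "set2": robjects.StrVector(["gene3", "gene4"])}
scores = _run_gsea(expr, genesets, method="ssgsea")  # one row per gene set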
def normalized_count(self):
    normalized_count_matrix = deseq.counts_DESeqDataSet(self.dds, normalized=True)
    normalized_count_matrix = to_dataframe(normalized_count_matrix)
    # switch back to python
    self.normalized_count_df = pandas2ri.ri2py_dataframe(normalized_count_matrix)
    self.normalized_count_df[self.gene_column] = self.gene_id.values
    return self.normalized_count_df
def _eval_one_setting_grf(train, test, n_train, n_test, num_trees=NUM_TREES_BASE,
                          d=1, te_function=None, baseline_model=None,
                          propensity_model=None, covariate_model=None,
                          error_model=None, binary_y=False, selection_bias=None,
                          seedy=42, root=PAPER_UTILS_ROOT):
    # get data
    np.random.seed(seedy)
    X, y, w, t, p, _ = make_te_data(n=n_train + n_test, d=d,
                                    te_model=te_function,
                                    baseline_model=baseline_model,
                                    covariate_model=covariate_model,
                                    propensity_model=propensity_model,
                                    binary_y=binary_y,
                                    error_model=error_model,
                                    seedy=seedy,
                                    selection_bias=selection_bias)

    # split data
    X_train, y_train, w_train, p_train, _ = _safe_split_te(X, y, w, p, t, train)
    X_test, _, _, _, t_test = _safe_split_te(X, y, w, p, t, test)

    # convert to R objects
    r_y = robjects.FloatVector(y_train)
    r_x = robjects.r.matrix(X_train, n_train, d)
    r_w = robjects.IntVector(w_train)
    r_p = robjects.FloatVector(p_train)
    r_x_test = robjects.r.matrix(X_test, n_test, d)

    # get function from R script
    r_source = robjects.r['source']
    r_source(root + 'grf_experiments.R')
    r_get_te_predictions = robjects.globalenv['get_te_predictions']
    r_out = r_get_te_predictions(r_x, r_y, r_w, r_p, r_x_test, num_trees=num_trees)
    out = pandas2ri.ri2py_dataframe(r_out).values
    mses = [mean_squared_error(t_test, out[:, i]) for i in range(5)]
    return mses
def predict(self, X, n_draws=0, parallel=False):
    X_out = self.x_scaler.transform(X)
    dfout = pd.DataFrame(X_out, columns=X.columns)
    dfoutpath = "{}/{}.feather".format(self.outdir, uuid.uuid4())
    dfout.to_feather(dfoutpath)
    ml = self.get_ml()
    out_ = ml.predict_(self.ml_, dfoutpath, n_draws, parallel, self.pacman_call)
    pred = pandas2ri.ri2py_dataframe(out_)
    os.remove(dfoutpath)
    return self.y_scaler.inverse_transform(pred.values)
def _edger_func_test(fit, design, contrast_str, fdr=0.01, lfc=1, return_full=False):
    rcontrast = r('makeContrasts')(contrast_str, levels=design)
    lrt = r('glmTreat')(fit, contrast=rcontrast, lfc=lfc)
    if return_full:
        toptags = r('topTags')(lrt, n=r('Inf'), **{'p.value': 1.})
    else:
        toptags = r('topTags')(lrt, n=r('Inf'), **{'p.value': fdr})
    if len(toptags) == 0:
        return pd.DataFrame(columns=toptags_cols)
    else:
        return pandas2ri.ri2py_dataframe(toptags[toptags.names.index('table')])
def predict(self, X, return_se: bool = False):
    """
    Make predictions, with or without the associated standard errors
    """
    if isinstance(X, pd.DataFrame):
        X = X.values
    n, d = X.shape
    r_x = robjects.r.matrix(X, n, d)
    if return_se:
        # predict with variance estimates
        r_pred = self._grf.predict_regression_forest(
            self._estimator, newdata=r_x, estimate_variance=True)
        r_pred = np.transpose(pandas2ri.ri2py_dataframe(r_pred).values)
        return r_pred[:, 0], r_pred[:, 1]
    else:
        r_pred = self._grf.predict_regression_forest(self._estimator,
                                                     newdata=r_x)
        r_pred = pandas2ri.ri2py_dataframe(r_pred).values
        return np.transpose(r_pred[0, :])
def _edger_tmm_normalisation_cpm(count_data):
    robjects = rinterface.robjects
    pandas2ri = rinterface.robjects.pandas2ri

    rdata = pandas2ri.py2ri(count_data)
    y = robjects.r("DGEList")(rdata)
    yn = robjects.r("calcNormFactors")(y)
    cpm = pandas2ri.ri2py_dataframe(robjects.r('cpm')(yn))
    cpm.index = count_data.index
    cpm.columns = count_data.columns
    return cpm
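# Hedged usage sketch for _edger_tmm_normalisation_cpm (not from the original
# source); assumes edgeR is installed in the R library and that `rinterface`
# wraps rpy2 as in the function above. The counts are toy values.
import pandas as pd

counts = pd.DataFrame({"sample_A": [10, 200, 3000],
                       "sample_B": [12, 180, 2800]},
                      index=["gene1", "gene2", "gene3"])
cpm = _edger_tmm_normalisation_cpm(counts)  # TMM-normalised counts per million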
def dataframe_to_pandas(r_frame):
    pd_frame = pandas2ri.ri2py_dataframe(r_frame)

    # Extract column names if possible.
    col_names = robjects.r.colnames(r_frame)
    if not type(col_names) == RNULLType:
        pd_frame.columns = col_names

    # Extract row names if possible.
    index = robjects.r.rownames(r_frame)
    if not type(index) == RNULLType:
        pd_frame.index = index

    return pd_frame
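# Hedged usage sketch for dataframe_to_pandas (not from the original source);
# builds a small R data.frame in the embedded session and converts it.
r_frame = robjects.r('data.frame(x = 1:3, y = c("a", "b", "c"), '
                     'row.names = c("r1", "r2", "r3"))')
pd_frame = dataframe_to_pandas(r_frame)
# pd_frame keeps the R column names ("x", "y") and row names ("r1".."r3")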
def MannKendall(self, data):
    rkt = rpackages.importr('Kendall')
    data = robjects.FloatVector(data)
    self.res = rkt.MannKendall(data)
    print(self.res)
    df = pandas2ri.ri2py_dataframe(self.res).transpose()
    df.columns = self.res.names
    df = df[['sl', 'S', 'B', 'varS', 'tau']]
    return df
def predict_proba(self, X):
    """
    Computes class probabilities for the input 'X'

    Parameters
    ----------
    X: pandas.DataFrame object

    Returns
    -------
    pandas.DataFrame of shape (#datapoints, 2), the probability of each
    class for each observation
    """
    if not isinstance(X, pd.DataFrame):
        raise exceptions.DataSetError("Only pandas.DataFrame as input type is currently supported")
    data_as_r_frame = self.__r_frame(self.__s_apply(X, self.__as_factor))
    results = self.__r_sbrl.predict_sbrl(self.model, data_as_r_frame)
    return pandas2ri.ri2py_dataframe(results).T
def peak_table(xcms_set, filebase="peakList"):
    """Export the global peak table

    Parameters
    ----------
    xcms_set : xcmsSet
        R xcms set.
    filebase : str
        Type of filebase to use.

    Returns
    -------
    out : dataFrame
        xcms peak dataFrame.
    """
    peak = robjects.r["peakTable"]
    tab = peak(xcms_set, filebase)
    df = pandas2ri.ri2py_dataframe(tab)
    df.columns = tab.colnames
    return df
def calculate_result(self, scores):
    """
    Generates a csv file with the resulting assignment while it updates
    the status of the process using Celery
    """
    update_frequency = 1
    max_steps = 7

    self.update_progress(1, max_steps, update_frequency=update_frequency)
    ro.r('library(MASS)')
    self.update_progress(2, max_steps, update_frequency=update_frequency)
    ro.r('library(Matrix)')
    self.update_progress(3, max_steps, update_frequency=update_frequency)
    ro.r('library(lme4)')
    self.update_progress(4, max_steps, update_frequency=update_frequency)
    ro.r('library(Rcpp)')
    self.update_progress(5, max_steps, update_frequency=update_frequency)
    ro.r('library(arm)')
    self.update_progress(6, max_steps, update_frequency=update_frequency)

    scores_pd = pd.DataFrame(scores)

    # estimate scores
    rdf = com.convert_to_r_dataframe(scores_pd)
    ro.globalenv['scores'] = rdf
    if 'Confidence' in scores_pd.columns:
        fit_str = 'fit <- lmer(Score ~ 1 + (1 | PaperID) + (1 | PersonID), scores, weights = Confidence)'
    else:
        fit_str = 'fit <- lmer(Score ~ 1 + (1 | PaperID) + (1 | PersonID), scores)'
    ro.r(fit_str)
    ro.r('''bayes_score <- data.frame(PaperID = rownames(fixef(fit) + ranef(fit)$PaperID),
                                      Mean = (fixef(fit) + ranef(fit)$PaperID)[,1],
                                      SD = (se.ranef(fit)$PaperID)[, 1])''')
    bayes_score = pandas2ri.ri2py_dataframe(ro.r('bayes_score'))
    self.update_progress(7, max_steps, update_frequency=update_frequency)
    return bayes_score.to_csv(None, na_rep='', index=False, encoding='utf-8')
def deaDESeq2(counts, conds, comparisons, alpha, size_factors=None):
    """Makes a call to DESeq2 to perform D.E.A. in the given
    counts matrix with the given conditions and comparisons.
    Can be given size factors.
    Returns a list of DESeq2 results for each comparison
    """
    results = list()
    try:
        pandas2ri.activate()
        deseq2 = RimportLibrary("DESeq2")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.DataFrame({"conditions": robjects.StrVector(conds)})
        design = r('formula(~ conditions)')
        dds = r.DESeqDataSetFromMatrix(countData=r_counts, colData=cond, design=design)
        if size_factors is None:
            dds = r.DESeq(dds, parallel=True, useT=True,
                          minmu=1e-6, minReplicatesForReplace=np.inf)
        else:
            assign_sf = r["sizeFactors<-"]
            dds = assign_sf(object=dds, value=robjects.FloatVector(size_factors))
            dds = r.estimateDispersions(dds)
            dds = r.nbinomWaldTest(dds)
        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B),
                               alpha=alpha, parallel=True)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
        pandas2ri.deactivate()
    except Exception as e:
        raise e
    return results
def apply_correlation_metric(r_stream1, r_stream2, metric_function_name, *args, **kwargs):
    """
    Invoke a named "correlation" R metric and convert the R dataframe result
    into a Pandas dataframe.
    :param r_stream1: an r_stream object
    :param r_stream2: an r_stream object
    :param metric_function_name: the name of the set of metrics
    :return:
    """
    function = 'IRISMustangMetrics::' + metric_function_name + 'Metric'
    R_function = robjects.r(function)
    pandas2ri.activate()
    # args and kwargs shouldn't be needed in theory
    r_metriclist = R_function(r_stream1, r_stream2, *args, **kwargs)
    pandas2ri.deactivate()
    r_dataframe = _R_metricList2DF(r_metriclist)
    df = pandas2ri.ri2py_dataframe(r_dataframe)

    # Convert columns from R POSIXct to Python UTCDateTime
    df.starttime = df.starttime.apply(UTCDateTime)
    df.endtime = df.endtime.apply(UTCDateTime)
    return df
def xml2df(url):
    # make some terrible R code
    from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
    from rpy2.robjects import pandas2ri
    string = """
        require(XML)
        require(plyr)

        getXML <- function(x) {
            xmlfile <- xmlTreeParse(x)
            temp = xmlToList(xmlfile, addAttributes = F)
            df <- ldply(temp, .fun=function(x) {data.frame(t(unlist(x)))})
            return(df)
        }
    """
    test = SignatureTranslatedAnonymousPackage(string, "test")

    # make a pandas DF out of the R df
    pydf = pandas2ri.ri2py_dataframe(test.getXML(url))
    return pydf
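# Hedged usage sketch for xml2df (not from the original source); the URL is a
# hypothetical placeholder for any XML document XML::xmlTreeParse can read.
df = xml2df("http://example.com/records.xml")
print(df.head())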
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--task", dest="task", type="choice",
                      choices=["set_factors", "dichotimise_phenotype",
                               "plink_format", "select_ethnicity",
                               "merge_covariates", "subset_phenotypes"],
                      help="task to execute on phenotype file(s)")

    parser.add_option("--R-script", dest="r_script", type="string",
                      help="R script for table reformatting")

    parser.add_option("--adjustment", dest="adjust", type="choice",
                      choices=["snp"],
                      help="adjustments to make pre- or post-merging")

    parser.add_option("--pheno-id", dest="dichot_var", type="string",
                      help="column header of the variable to be dichotomised")

    parser.add_option("--reference-variable", dest="ref_level", type="string",
                      help="level of the dichotomised variable to set to 1")

    parser.add_option("--missing-var-label", dest="missing_label", type="string",
                      help="missing/unobserved value labels")

    parser.add_option("--id-variable", dest="id_var", type="string",
                      help="ID variable column header")

    parser.add_option("--ethnicity-id", dest="ethnic_var", type="string",
                      help="column header for variable containing "
                      "ethnicity data")

    parser.add_option("--ethnicity-label", dest="ethnic", type="string",
                      help="ethnicity label to select samples on")

    parser.add_option("--covariate-file", dest="covar_file", type="string",
                      help="a comma-separated list of files to be merged, or "
                      "a single file")

    parser.add_option("--fam-file", dest="fam_file", type="string",
                      help="Plink .fam file that specifies which samples "
                      "to subset from the phenotypes file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]
    if options.task == "set_factors":
        pandas2ri.activate()
        R('''source("%s")''' % options.r_script)
        R('''format <- format_phenotypes("%s")''' % infile)
        pheno_df = pandas2ri.ri2py_dataframe(R["format"])
        pheno_df["IID"] = pheno_df["f.eid"]

        cols = pheno_df.columns.tolist()
        cols = [xc if not re.search("f.eid", xc) else "FID" for xc in cols]
        # columns need to be FID, IID, ...
        cols.remove("FID")
        cols.remove("IID")
        new_cols = cols
        new_cols.insert(0, "FID")
        new_cols.insert(1, "IID")
        pheno_df.columns = new_cols

        pheno_df.to_csv(options.stdout, sep="\t", index=False)

    elif options.task == "dichotimise_phenotype":
        # catch situation where the delimiter is not tab
        try:
            df = pd.read_table(infile, sep="\t", header=0, index_col=None)
            assert len(df.columns) > 1
        except AssertionError:
            df = pd.read_table(infile, sep="\s+", index_col=None)

        var = pd.Series(df[options.dichot_var].copy(), dtype=np.int64)
        ref = np.int64(options.ref_level)
        mask = var.isin([ref])
        # set NA or unobserved to missing, assume missing value is -9 (Plink standard)
        nas = np.isnan(var)
        var[~mask] = 1
        var[mask] = 2
        var[nas] = -9

        # there may be multiple missing/unobserved data categories to deal with
        missing = options.missing_label.split(",")
        if len(missing) > 1:
            miss_mask = var.isin(missing)
        else:
            miss_mask = var.isin(missing)
        var[miss_mask] = -9

        # output in plink format
        p_df = df.loc[:, ("FID", "IID", options.dichot_var)]
        p_df[options.dichot_var] = pd.Series(var, dtype=np.int64)
        p_df.index = p_df["FID"]
        p_df.drop(labels="FID", axis=1, inplace=True)
        p_df.to_csv(options.stdout, sep="\t", index_label="FID")

    elif options.task == "plink_format":
        # add IID and FID columns based on individual IDs
        pheno_df = pd.read_table(infile, sep="\t", header=0, index_col=None)
        pheno_df["IID"] = pheno_df[options.id_var]
        pheno_df["FID"] = pheno_df[options.id_var]

        cols = pheno_df.columns.tolist()
        cols = [xc for xc in cols if not re.search(options.id_var, xc)]
        # columns need to be FID, IID, ...
        cols.remove("FID")
        cols.remove("IID")
        new_cols = cols
        new_cols.insert(0, "FID")
        new_cols.insert(1, "IID")

        resort_df = pheno_df[new_cols]
        resort_df.index = resort_df["FID"]
        resort_df.drop(labels="FID", axis=1, inplace=True)
        resort_df.to_csv(options.stdout, sep="\t")

    elif options.task == "select_ethnicity":
        # select ethnicity
        pheno_df = pd.read_table(infile, sep="\t", header=0, index_col=None)
        ethnic_var = pheno_df.loc[:, options.ethnic_var].copy()
        ethnic_mask = ethnic_var == int(options.ethnic)
        select_indv = ethnic_var[ethnic_mask].index

        filter_df = pheno_df.loc[select_indv, :]
        filter_df.index = filter_df["FID"]
        filter_df.drop(labels="FID", axis=1, inplace=True)
        filter_df.to_csv(options.stdout, sep="\t", index_label="FID")

    elif options.task == "merge_covariates":
        if len(options.covar_file.split(",")) > 1:
            filelist = options.covar_file.split(",")
            df = pd.read_table(filelist.pop(0), sep="\t",
                               index_col=None, header=0)
            if options.adjust == "snp":
                re_snp = re.compile(".raw")
                snp_file = [fil for fil in filelist if re.search(re_snp, fil)][0]
                _df = pd.read_table(snp_file, sep="\t", header=0, index_col=None)
                cols = _df.columns[6:]
                real_cols = list(_df.columns[:6])
                snp_cols = [sc.split("_")[:-1][0] for sc in cols]
                # list methods work in place, don't assign as a new variable
                real_cols.extend(snp_cols)
                _df.columns = real_cols
                df = pd.merge(left=df, right=_df, on=["FID", "IID"], how='inner')
                try:
                    filelist.remove(snp_file)
                except:
                    pass
            for fle in filelist:
                _df = pd.read_table(fle, sep="\t", header=0, index_col=None)
                df = pd.merge(left=df, right=_df, on=["FID", "IID"], how='inner')

            # python outputs NA as blank when writing to stdout,
            # plink expects values, use string NAs
            df = df.fillna("NA")
            df.index = df["FID"]
            df.drop(["FID"], inplace=True, axis=1)
            df.to_csv(options.stdout, index_label="FID", sep="\t")
        else:
            E.warn("only a single covariates file provided. "
                   "No merging possible, exiting")

    elif options.task == "subset_phenotypes":
        fam_df = pd.read_table(options.fam_file, sep=None,
                               index_col=None, header=None)
        fam_df.columns = ["FID", "IID", "PAT", "MAT", "SEX", "PHENO"]
        pheno_df = pd.read_table(infile, sep=None, index_col=0, header=0)

        fam_ids = fam_df["FID"]
        sub_pheno = pheno_df.loc[fam_ids]
        sub_pheno.to_csv(options.stdout, index_label="FID", sep="\t")
    else:
        pass

    # write footer and output benchmark information.
    E.Stop()
pop_freq = r('bs_imp$pop.freq')
pop_freq_names = pandas2ri.ri2py(pop_freq.names)

# %%

def compute_He(elem):
    # He = 2 * product of allele frequencies (2pq for a biallelic locus)
    He = 2
    for x in elem:
        He *= x
    return He

He_dict = {}
for i, name in enumerate(pop_freq_names):
    af = pandas2ri.ri2py_dataframe(pop_freq.rx2(name))
    af.columns = [x + 1 for x in af.columns]
    He_dict[name] = af.apply(compute_He).to_dict()
    if i % 10000 == 0:
        print("at %d" % i)

# %%
He = pd.DataFrame(He_dict).T

# %%
He.columns = [popid_map[x] for x in He.columns]

# %%
Ho_He = Ho.join(He, lsuffix="_Ho", rsuffix="_He")
def testColoc(trait1, trait2, trait1_type, trait2_type, maf_table,
              gene_list=None, trait1_prev=None, trait2_prev=None,
              chromosome=None, start=None, end=None):
    '''
    Perform colocalization testing between two traits.

    Arguments
    ---------
    trait1: pandas.core.dataframe
        A data frame containing the summary statistics for trait 1

    trait2: pandas.core.dataframe
        A data frame containing the summary statistics for trait 2

    trait1_type: string
        Either `cc` or `quant`, denoting the type of trait 1

    trait2_type: string
        Either `cc` or `quant`, denoting the type of trait 2

    maf_table: pandas.core.dataframe
        Data frame containing SNP IDs and MAF

    gene_list: list
        A list of genes to restrict analysis to. Either trait 1 or
        trait 2 must be a quantitative trait

    trait1_prev: float
        Prevalence of trait1 if binary

    trait2_prev: float
        Prevalence of trait2 if binary

    chromosome: int
        Chromosome to restrict the colocalisation analysis to

    start: int
        start co-ordinate to restrict analysis to. Must also provide
        `chromosome`. 1-based index, closed [start, end]

    end: int
        end co-ordinate to restrict analysis to. Must also provide
        `chromosome` and `start`. 1-based index, closed [start, end]

    Returns
    -------
    coloc_results: pandas.core.dataframe
        A data frame containing each region (e.g. genes) and the posterior
        probability in favour of each hypothesis:
        H0 - no association with trait1 or trait2, and no colocalisation
        H1 - association with trait 1, but no colocalisation
        H2 - association with trait2, but no colocalisation
        H3 - association with trait1 and 2, but no colocalisation
        H4 - association with trait1 and 2, and colocalised
    '''

    # push all elements into the R environment
    R('''sink(file="sink.text")''')
    R('''suppressPackageStartupMessages(library(coloc))''')
    R('''source("/ifs/devel/projects/proj045/gwas_pipeline/R_scripts/coloQtl.R")''')

    E.info("Pushing results tables into R environment")
    py2ri.activate()
    r_trait1 = py2ri.py2ri_pandasdataframe(trait1)
    R.assign("r.trait1", r_trait1)

    r_trait2 = py2ri.py2ri_pandasdataframe(trait2)
    R.assign("r.trait2", r_trait2)

    r_maf = py2ri.py2ri_pandasdataframe(maf_table)
    R.assign("r.mafs", r_maf)

    if trait1_prev:
        R.assign("trait1.prev", trait1_prev)
    else:
        R('''trait1.prev <- NULL''')
    if trait2_prev:
        R.assign("trait2.prev", trait2_prev)
    else:
        R('''trait2.prev <- NULL''')

    E.info("Checking for gene list")
    if gene_list:
        E.info("Gene list contains {} genes".format(len(set(gene_list))))
        r_genes = ro.StrVector([rx for rx in set(gene_list)])
        R.assign("gene.list", r_genes)
        E.info("Iterating over gene list")
        R('''res.df <- geneListSnpColocQtl(gene_list=gene.list,'''
          '''results_table=r.trait1, MAF_table=r.mafs, '''
          '''eqtl_table=r.trait2, trait_type="%(trait1_type)s", '''
          '''prev=trait1.prev)''' % locals())
        R('''genes <- rownames(res.df)''')
        genes = [gx for gx in R["genes"]]
    else:
        R('''res.df <- TwoTraitSnpColocQtl(trait1_table=r.trait1,'''
          '''trait2_table=r.trait2, MAF_table=r.mafs, '''
          '''trait1_type="%(trait1_type)s", trait2_type="%(trait2_type)s",'''
          '''prev1=trait1.prev, prev2=trait2.prev)''' % locals())
        R('''genes <- dim(res.df)[1]''')
        genes = R["genes"]

    coloc_results = py2ri.ri2py_dataframe(R["res.df"])
    coloc_results.index = genes
    coloc_results.columns = ["nSNPs", "H0.PP", "H1.PP",
                             "H2.PP", "H3.PP", "H4.PP"]

    R('''sink(file=NULL)''')
    return coloc_results
def pythonWrapper4Pet(dataframe, snps, covars, trait1, trait2,
                      model1, model2, resamples=999):
    '''
    This is just a Python wrapper around the R code for the PET calculations
    '''

    py2ri.activate()
    E.info("Checking regression models")
    if model1 == "logistic":
        R('''trait1.mod <- binomial''')
        R('''trait1.link <- "logit" ''')
    elif model1 == "linear":
        R('''trait1.mod <- gaussian''')
        R('''trait1.link <- "identity" ''')

    if model2 == "logistic":
        R('''trait2.mod <- binomial''')
        R('''trait2.link <- "logit" ''')
    elif model2 == "linear":
        R('''trait2.mod <- gaussian''')
        R('''trait2.link <- "identity" ''')

    E.info("Running {} regression for trait 1: {}".format(model1, trait1))
    E.info("Running {} regression for trait 2: {}".format(model2, trait2))

    R('''source("/ifs/devel/projects/proj045/gwas_pipeline/R_scripts/PET_functions.R")''')
    E.info("Pushing data objects into the R environment")
    # push everything into the R environment
    r_df = py2ri.py2ri_pandasdataframe(dataframe)
    R.assign("data.df", r_df)

    r_snps = ro.StrVector([sp for sp in snps])
    R.assign("snp.list", r_snps)

    E.info("Parsing covariates")
    covars = covars.split(",")
    r_covar = ro.StrVector([cv for cv in covars])
    R.assign("covar.list", r_covar)
    E.info("{} covariates found to adjust "
           "in regression models".format(len(covars)))

    # clean up, replacing "missing values" with NAs for R
    R('''data.df[data.df == -9] <- NA''')
    R('''pet_results <- list()''')

    # loop over all SNPs, calculate PCC and p-value
    # this takes a long time <- need to think of speed ups
    # possible Python-pure implementation, i.e. with LIMIX?
    E.info("Iteratively calculating PCC for all SNPs")
    R('''results <- loopPET(data.df=data.df, trait1="%(trait1)s", trait2="%(trait2)s", '''
      '''trait1.link=trait1.link, trait2.link=trait2.link, '''
      '''trait1.mod=trait1.mod, trait2.mod=trait2.mod, covars=covar.list,'''
      '''resamples=%(resamples)i, snp.list=snp.list)''' % locals())

    R('''out.res <- data.frame(do.call(rbind, results))''')
    R('''colnames(out.res) <- c("PCC", "pvalue")''')
    py_out = py2ri.ri2py_dataframe(R["out.res"])
    return py_out
def applymem(df, discarded_seasons=None, wdw_method=2, lower_bound=5.0):
    rdf = pandas2ri.py2ri(df)
    seasons = sorted(list(df.columns.drop(['UF', 'epiweek'])))[:-1]
    # Discard unwanted seasons (e.g. 2009) if requested:
    if discarded_seasons:
        seasons = sorted(set(seasons).difference(discarded_seasons))
    rseasons = ro.StrVector(seasons)

    # # Method for obtaining typical time series evolution (default 2)
    # ro.globalenv['par.type.curve'] = 2
    # # Method for obtaining pre/post-epidemic threshold (default 4)
    # ro.globalenv['par.type.threshold'] = 2
    # # Method for obtaining intensity thresholds (default 4)
    # ro.globalenv['par.type.intensity'] = 2
    # # Method for obtaining outbreak start and length (default 6)
    # ro.globalenv['par.type.other'] = 2
    # # Total number of points to obtain pre/post-threshold (will take n/seasons from each)
    # ro.globalenv['par.n.max'] = 30
    # # Confidence interval for modelled curve
    # ro.globalenv['par.level.curve'] = 0.90
    # # Confidence interval for pre/post-threshold
    # ro.globalenv['par.level.threshold'] = 0.95
    # # Quantiles for intensity thresholds
    # ro.globalenv['par.level.intensity'] = ro.FloatVector([0.40, 0.90, 0.975])

    ro.globalenv['df'] = rdf
    ro.globalenv['seasons'] = rseasons
    ro.globalenv['par.method'] = wdw_method
    ro.globalenv['par.type.curve'] = 2
    ro.globalenv['par.n.max'] = 20
    ro.globalenv['par.level.curve'] = 0.95
    ro.globalenv['par.level.threshold'] = 0.95
    ro.globalenv['par.type.intensity'] = 6
    ro.globalenv['par.level.intensity'] = ro.FloatVector([0.40, 0.90, 0.975])

    epimemrslt = ro.r('memmodel(i.data=subset(df, select=seasons), i.type.curve=par.type.curve, '
                      'i.method=par.method, i.n.max=par.n.max, i.level.curve=par.level.curve, '
                      'i.level.threshold=par.level.threshold, i.type.intensity=par.type.intensity, '
                      'i.level.intensity=par.level.intensity)')

    # Pre-epidemic threshold:
    epithreshold = max(lower_bound,
                       pandas2ri.ri2py_dataframe(epimemrslt.rx2('pre.post.intervals')).loc[0, 2])
    typrealcurve = pandas2ri.ri2py_dataframe(epimemrslt.rx2('typ.real.curve'))

    # Check for seasons below threshold:
    dropseasons = set()
    for s in seasons:
        if df[s].max() < epithreshold:
            dropseasons.add(s)
    # Drop seasons below threshold and rerun algorithm:
    episeasons = list(seasons)
    if 0 < len(dropseasons) < len(seasons):
        episeasons = sorted(list(set(seasons).difference(dropseasons)))
        ro.globalenv['episeasons'] = ro.StrVector(episeasons)
        epimemrslt = ro.r('memmodel(i.data=subset(df, select=episeasons), '
                          'i.type.curve=par.type.curve, i.method=par.method, '
                          'i.n.max=par.n.max, i.level.curve=par.level.curve, '
                          'i.level.threshold=par.level.threshold, '
                          'i.type.intensity=par.type.intensity, '
                          'i.level.intensity=par.level.intensity)')

    # Store results in a Python dictionary of objects
    pyepimemrslt = {}
    rovector = [ro.vectors.StrVector, ro.vectors.IntVector,
                ro.vectors.FloatVector, ro.vectors.Vector]
    for name in epimemrslt.names:
        rdata = epimemrslt.rx2(name)
        if name == 'call':
            pyepimemrslt.update({name: str(rdata)})
        elif type(rdata) in rovector:
            pyepimemrslt.update({name: pandas2ri.ri2py_vector(rdata)})
        else:
            pyepimemrslt.update({name: pandas2ri.ri2py_dataframe(rdata)})

    # typ.curve is the typical curve obtained from averaging over epidemic seasons,
    # with time rescaled so that the start of the epidemic period coincides with mean.start
    pyepimemrslt['typ.curve'].rename(columns={0: 'baixo', 1: 'mediano', 2: 'alto'},
                                     inplace=True)
    pyepimemrslt['typ.curve']['mediano'].fillna(0, inplace=True)
    pyepimemrslt['typ.curve']['baixo'] = pyepimemrslt['typ.curve']['baixo'].where(
        pyepimemrslt['typ.curve']['baixo'] >= 0, other=0)
    pyepimemrslt['typ.curve']['baixo'] = pyepimemrslt['typ.curve']['baixo']. \
        where(~pyepimemrslt['typ.curve']['baixo'].isnull(),
              other=pyepimemrslt['typ.curve']['mediano'])
    pyepimemrslt['typ.curve']['alto'] = pyepimemrslt['typ.curve']['alto']. \
        where(~pyepimemrslt['typ.curve']['alto'].isnull(),
              other=pyepimemrslt['typ.curve']['mediano'])

    pyepimemrslt['typ.threshold.curve'].rename(columns={0: 'baixo', 1: 'mediano', 2: 'alto'},
                                               inplace=True)
    pyepimemrslt['typ.threshold.curve']['mediano'].fillna(0, inplace=True)
    pyepimemrslt['typ.threshold.curve']['baixo'] = pyepimemrslt['typ.threshold.curve']['baixo']. \
        where(pyepimemrslt['typ.threshold.curve']['baixo'] >= 0, other=0)
    pyepimemrslt['typ.threshold.curve']['baixo'] = pyepimemrslt['typ.threshold.curve']['baixo']. \
        where(~pyepimemrslt['typ.threshold.curve']['baixo'].isnull(),
              other=pyepimemrslt['typ.threshold.curve']['mediano'])
    pyepimemrslt['typ.threshold.curve']['alto'] = pyepimemrslt['typ.threshold.curve']['alto']. \
        where(~pyepimemrslt['typ.threshold.curve']['alto'].isnull(),
              other=pyepimemrslt['typ.threshold.curve']['mediano'])

    pyepimemrslt['pre.post.intervals'].rename(index={0: 'pre', 1: 'post'}, inplace=True)

    # typ.real.curve is the typical curve without time shift, i.e. respecting the original
    # weeks from the data; this curve is better for keeping all seasons, not only the epidemic ones.
    pyepimemrslt['typ.real.curve'] = typrealcurve.copy()
    pyepimemrslt['typ.real.curve'].rename(columns={0: 'baixo', 1: 'mediano', 2: 'alto'},
                                          inplace=True)
    pyepimemrslt['typ.real.curve']['mediano'].fillna(0, inplace=True)
    pyepimemrslt['typ.real.curve'].loc[pyepimemrslt['typ.real.curve']['baixo'] < 0, 'baixo'] = 0
    pyepimemrslt['typ.real.curve']['baixo'] = pyepimemrslt['typ.real.curve']['baixo']. \
        where(~pyepimemrslt['typ.real.curve']['baixo'].isnull(),
              other=pyepimemrslt['typ.real.curve']['mediano'])
    pyepimemrslt['typ.real.curve']['alto'] = pyepimemrslt['typ.real.curve']['alto']. \
        where(~pyepimemrslt['typ.real.curve']['alto'].isnull(),
              other=pyepimemrslt['typ.real.curve']['mediano'])

    newcols = {}
    for k, v in enumerate(episeasons):
        newcols[k] = str(v) + ' transladado'
    pyepimemrslt['moving.epidemics'].rename(columns=newcols, inplace=True)

    return pyepimemrslt, dropseasons
# This data frame contains the following columns:
#
# - type: Tumor DNA profile (1=Aneuploid Tumor, 2=Diploid Tumor)
# - time: Time to death or on-study time, weeks
# - delta: Death indicator (0=alive, 1=dead)

# In[3]:

# Load in data
get_ipython().magic(u'R data(tongue)')
# Pull data into python kernel
get_ipython().magic(u'Rpull tongue')
# Convert into pandas dataframe
from rpy2.robjects import pandas2ri
tongue = pandas2ri.ri2py_dataframe(tongue)

# We can now refer to `tongue` using both R and python.

# In[4]:

get_ipython().run_cell_magic(u'R', u'', u'summary(tongue)')

# In[5]:

tongue.describe()

# We can even operate on R and Python within the same code cell.
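# In[6]:

# A hedged illustration (not from the original notebook) of mixing R and Python:
# `tongue` exists on both sides at this point, so an R expression evaluated via
# the %R line magic can feed a Python filter in the same cell.
median_time = get_ipython().magic(u'R median(tongue$time)')
tongue[tongue.time > float(median_time[0])].describe()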