def test_convert_r_dataframe(self):
    is_na = robj.baseenv.get("is.na")

    seriesd = tm.getSeriesData()
    frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A'])

    # Null data
    frame["E"] = [np.nan for item in frame["A"]]
    # Some mixed type data
    frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)]

    r_dataframe = com.convert_to_r_dataframe(frame)

    assert np.array_equal(
        com.convert_robj(r_dataframe.rownames), frame.index)
    assert np.array_equal(
        com.convert_robj(r_dataframe.colnames), frame.columns)
    assert all(is_na(item) for item in r_dataframe.rx2("E"))

    for column in frame[["A", "B", "C", "D"]]:
        coldata = r_dataframe.rx2(column)
        original_data = frame[column]
        assert np.array_equal(com.convert_robj(coldata), original_data)

    for column in frame[["D", "E"]]:
        for original, converted in zip(frame[column],
                                       r_dataframe.rx2(column)):
            if pd.isnull(original):
                assert is_na(converted)
            else:
                assert original == converted
def SCCA_r(X, Y, n_components, pen):
    df_X = pd.DataFrame(X)
    df_Y = pd.DataFrame(Y)

    rmat_X = com.convert_to_r_matrix(df_X)
    rmat_Y = com.convert_to_r_matrix(df_Y)

    ri.globalenv['X'] = rmat_X
    ri.globalenv['Y'] = rmat_Y

    com.r(
        """
        out <- CCA(x = X, z = Y, K = %i, niter = 100, standardize = FALSE,
                   penaltyx = %f, penaltyz = %f)
        """ % (n_components, pen[0], pen[1]))

    # convert the results back to dataframes and then to numpy arrays
    df_u = com.convert_robj(com.r('out[1]'))['u']
    df_v = com.convert_robj(com.r('out[2]'))['v']
    cors = com.convert_robj(com.r('out[16]'))['cors']

    x_loadings = df_u.as_matrix()
    y_loadings = df_v.as_matrix()
    cors = np.array(cors)

    loadings = (x_loadings, y_loadings)
    return loadings, cors
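# Hypothetical usage sketch for SCCA_r above (not from the original source).
# Assumptions: `com` is pandas.rpy.common, `ri` exposes R's global environment
# as the function requires, `np` is numpy, and the R package 'PMA' (which
# provides CCA) is installed; the random data below is made up.
com.r('library(PMA)')
rng = np.random.RandomState(0)
X_demo = rng.randn(50, 10)
Y_demo = rng.randn(50, 8)
(x_loadings, y_loadings), cors = SCCA_r(X_demo, Y_demo, n_components=2,
                                        pen=(0.3, 0.3))
print(x_loadings.shape, y_loadings.shape, cors)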
def sample(self, niter, thin=1, variables=None, run_diagnostic=True):
    """
    variables: if None, use all as extracted with
               self.get_variables(which='unobserved')
    """
    if not self._burnin_ok:
        print "WARNING: you might want to run burnin() first"
    if variables is None:
        variables = self.get_variables(which='unobserved')
    robj.r.assign('pyjags_variables', np.array(variables))

    with capture_output() as io:  # get rid of some remaining output
        robj.r(_R_sample_dic.format(niter=niter, thin=thin))

    # temporarily disable numpy conversion
    rpy2.robjects.numpy2ri.deactivate()

    if run_diagnostic:
        robj.r('pyjags_gelman=gelman.diag(pyjags_samp$samples)$psrf')
        self._gelmandiag_last_run = com.convert_robj(robj.r('pyjags_gelman'))
        if np.any(self._gelmandiag_last_run.iloc[:, 0] > 1.05):
            print "WARNING: there may be problems with your convergence (some R>1.05)"
    else:
        self._gelmandiag_last_run = None

    ms = com.convert_robj(robj.r('as.matrix(pyjags_samp$samples)'))
    self._dic_last_run = com.convert_robj(robj.r('pyjags_samp$dic'))

    # enable numpy conversion again
    rpy2.robjects.numpy2ri.activate()
    return ms
def test_convert_r_matrix(self):
    is_na = robj.baseenv.get("is.na")

    seriesd = tm.getSeriesData()
    frame = pd.DataFrame(seriesd, columns=["D", "C", "B", "A"])

    # Null data
    frame["E"] = [np.nan for item in frame["A"]]

    r_dataframe = com.convert_to_r_matrix(frame)

    assert np.array_equal(
        com.convert_robj(r_dataframe.rownames), frame.index)
    assert np.array_equal(
        com.convert_robj(r_dataframe.colnames), frame.columns)
    assert all(is_na(item) for item in r_dataframe.rx(True, "E"))

    for column in frame[["A", "B", "C", "D"]]:
        coldata = r_dataframe.rx(True, column)
        original_data = frame[column]
        assert np.array_equal(com.convert_robj(coldata), original_data)

    # Pandas bug 1282
    frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)]

    # A mixed-type frame cannot be converted to an R matrix;
    # a TypeError is the expected outcome here.
    try:
        wrong_matrix = com.convert_to_r_matrix(frame)
    except TypeError:
        pass
    except Exception:
        raise
def _ccaPermute(self, X, Z, **params):
    """Performs CCA.permute from the PMA package to see which penalty
    values are better"""
    pma = importr("PMA")
    kwParams = {"typex": "standard", "typez": "standard", "trace": True}
    kwParams.update(params)
    print("\tCCA permute parameters:", kwParams)

    cca_permute = ro.r['CCA.permute'](X, Z, **kwParams)
    header = [
        'penaltyxs', 'penaltyzs', 'zstats', 'pvals', 'cors', 'ft.corperms',
        'nnonzerous', 'nnonzerovs'
    ]
    header2 = [
        "X Penalty", "Z Penalty", "Z-Stat", "P-Value", "Cors", "FT(Cors)",
        "# U's Non-Zero", "# Vs Non-Zero"
    ]
    cca_permute = {k: v for k, v in list(cca_permute.items())}

    df = pd.DataFrame(
        {h: com.convert_robj(cca_permute[h]) for h in header},
        columns=header)
    df.columns = header2
    df.index = range(1, 18)
    print("\n", df)
    print()
    print("Best L1 bound for x: %.5f"
          % com.convert_robj(cca_permute["bestpenaltyx"])[0])
    print("Best L1 bound for z: %.5f"
          % com.convert_robj(cca_permute["bestpenaltyz"])[0])
def ccaOutcomesVsControls(self, penaltyX=None, penaltyZ=None, NAthresh=4):
    """Performs CCA using controls and outcomes, no language"""
    (groups, allOutcomes, controls) = self.outcomeGetter.getGroupsAndOutcomes()
    # groups: set(group_ids)
    # allOutcomes: {outcome: {group_id: value}}
    # controls: {control: {group_id: value}}

    print "X: controls\nZ: outcomes"
    Zdict = allOutcomes
    Xdict = controls

    # R doesn't handle '$'s in column names
    Xdict = {k.replace('$', '.'): v for k, v in Xdict.iteritems()}
    Zdict = {k.replace('$', '.'): v for k, v in Zdict.iteritems()}

    Xdf = pd.DataFrame(data=Xdict)
    Zdf = pd.DataFrame(data=Zdict)

    # X, Z, Xfreqs, Zfreqs = self.prepMatrices(Xdf, Zdf, NAthresh=NAthresh,
    #                                          softImputeXtoo=True)
    X, Z, Xfreqs, Zfreqs = self.prepMatricesTogether(Xdf, Zdf, NAthresh=NAthresh)

    kwParams = {}
    if penaltyX:
        kwParams['penaltyx'] = penaltyX
    if penaltyZ:
        kwParams['penaltyz'] = penaltyZ
    # kwParams['upos'] = True
    # kwParams['vneg'] = True

    cca = self._cca(X, Z, **kwParams)
    Xcomp = com.convert_robj(cca['u'])  # Controls
    Zcomp = com.convert_robj(cca['v'])  # Outcomes
    d = com.convert_robj(cca['d'])      # Something

    self.model = {
        'u': Xcomp,
        'v': Zcomp,
        'd': d,
    }

    featureNames = X.columns
    Xcomp.index = [i.strip("X") for i in featureNames]
    Xfreqs = {k.strip("X"): v for k, v in Xfreqs.iteritems()}
    Xcomp.columns = ["%.2d_comp" % i for i in xrange(Xcomp.shape[1])]

    outcomeNames = Z.columns
    Zcomp.index = [i.strip("X") for i in outcomeNames]
    Zfreqs = {k.strip("X"): v for k, v in Zfreqs.iteritems()}
    Zcomp.columns = ["%.2d_comp" % i for i in xrange(Zcomp.shape[1])]

    Zcomp2 = pd.concat([Xcomp, Zcomp])

    Xcomp_dict = {k: {i: (j, 0.0 if j != 0 else 1, cca["nGroups"], Xfreqs[i])
                      for i, j in v.iteritems()}
                  for k, v in Xcomp.to_dict().iteritems()}
    Zcomp_dict = {k: {i: (j, 0.0 if j != 0 else 1, cca["nGroups"],
                          Zfreqs[i] if i in Zfreqs.keys() else Xfreqs[i])
                      for i, j in v.iteritems()}
                  for k, v in Zcomp2.to_dict().iteritems()}
    d_dict = dict(zip(Zcomp.columns, d))

    return Xcomp_dict, Zcomp_dict, d_dict
def ccaOutcomesVsControls(self, groupFreqThresh=0, penaltyX=None,
                          penaltyZ=None, NAthresh=4):
    """Performs CCA using controls and outcomes, no language"""
    (groups, allOutcomes, controls) = \
        self.outcomeGetter.getGroupsAndOutcomes(groupFreqThresh)
    # groups: set(group_ids)
    # allOutcomes: {outcome: {group_id: value}}
    # controls: {control: {group_id: value}}

    print "X: controls\nZ: outcomes"
    Zdict = allOutcomes
    Xdict = controls

    # R doesn't handle '$'s in column names
    Xdict = {k.replace('$', '.'): v for k, v in Xdict.iteritems()}
    Zdict = {k.replace('$', '.'): v for k, v in Zdict.iteritems()}

    Xdf = pd.DataFrame(data=Xdict)
    Zdf = pd.DataFrame(data=Zdict)

    # X, Z, Xfreqs, Zfreqs = self.prepMatrices(Xdf, Zdf, NAthresh=NAthresh,
    #                                          softImputeXtoo=True)
    X, Z, Xfreqs, Zfreqs = self.prepMatricesTogether(Xdf, Zdf, NAthresh=NAthresh)

    kwParams = {}
    if penaltyX:
        kwParams['penaltyx'] = penaltyX
    if penaltyZ:
        kwParams['penaltyz'] = penaltyZ
    # kwParams['upos'] = True
    # kwParams['vneg'] = True

    cca = self._cca(X, Z, **kwParams)
    Xcomp = com.convert_robj(cca['u'])  # Controls
    Zcomp = com.convert_robj(cca['v'])  # Outcomes
    d = com.convert_robj(cca['d'])      # Something

    self.model = {
        'u': Xcomp,
        'v': Zcomp,
        'd': d,
    }

    featureNames = X.columns
    Xcomp.index = [i.strip("X") for i in featureNames]
    Xfreqs = {k.strip("X"): v for k, v in Xfreqs.iteritems()}
    Xcomp.columns = ["%.2d_comp" % i for i in xrange(Xcomp.shape[1])]

    outcomeNames = Z.columns
    Zcomp.index = [i.strip("X") for i in outcomeNames]
    Zfreqs = {k.strip("X"): v for k, v in Zfreqs.iteritems()}
    Zcomp.columns = ["%.2d_comp" % i for i in xrange(Zcomp.shape[1])]

    Zcomp2 = pd.concat([Xcomp, Zcomp])

    Xcomp_dict = {k: {i: (j, 0.0 if j != 0 else 1, cca["nGroups"], Xfreqs[i])
                      for i, j in v.iteritems()}
                  for k, v in Xcomp.to_dict().iteritems()}
    Zcomp_dict = {k: {i: (j, 0.0 if j != 0 else 1, cca["nGroups"],
                          Zfreqs[i] if i in Zfreqs.keys() else Xfreqs[i])
                      for i, j in v.iteritems()}
                  for k, v in Zcomp2.to_dict().iteritems()}
    d_dict = dict(zip(Zcomp.columns, d))

    return Xcomp_dict, Zcomp_dict, d_dict
def _ccaPermute(self, X, Z, **params):
    """Performs CCA.permute from the PMA package to see which penalty
    values are better"""
    pma = importr("PMA")
    kwParams = {"typex": "standard", "typez": "standard", "trace": True}
    kwParams.update(params)
    print "\tCCA permute parameters:", kwParams

    cca_permute = ro.r['CCA.permute'](X, Z, **kwParams)
    header = ['penaltyxs', 'penaltyzs', 'zstats', 'pvals', 'cors',
              'ft.corperms', 'nnonzerous', 'nnonzerovs']
    header2 = ["X Penalty", "Z Penalty", "Z-Stat", "P-Value", "Cors",
               "FT(Cors)", "# U's Non-Zero", "# Vs Non-Zero"]
    cca_permute = {k: v for k, v in cca_permute.items()}

    df = pd.DataFrame({h: com.convert_robj(cca_permute[h]) for h in header},
                      columns=header)
    df.columns = header2
    df.index = xrange(1, 18)
    print "\n", df
    print
    print "Best L1 bound for x: %.5f" % com.convert_robj(cca_permute["bestpenaltyx"])[0]
    print "Best L1 bound for z: %.5f" % com.convert_robj(cca_permute["bestpenaltyz"])[0]
def test_fit_with_pandas_data(self, Model, dataframe):
    X, y = dataframe
    model = Model(scriptname='myscript', funcname='myfunc', some='kwarg')
    model.fit(X, y)

    funcargs = model.r['myfunc'].call_args
    assert (convert_robj(funcargs[0][0]) == X).all().all()
    assert (convert_robj(funcargs[0][1]) == y).all()
    assert funcargs[1]['some'] == 'kwarg'
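# The `dataframe` fixture used above is not shown in this snippet. A minimal
# pytest sketch of what it plausibly supplies (a feature DataFrame and a
# target Series) -- an assumption, not the project's actual fixture:
import pytest

@pytest.fixture
def dataframe():
    X = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [4.0, 5.0, 6.0]})
    y = pd.Series([0, 1, 0], name='y')
    return X, y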
def to_py(o, skip_list=False):
    """
    Converts to a Python object if possible. Otherwise wraps in RObjectWrapper.
    """
    res = None
    try:
        rcls = o.do_slot("class")
        rcls = list(rcls)
    except LookupError:
        rcls = []
    try:
        rclass = list(o.rclass)
    except:
        rclass = []
    classes = rclass + rcls

    if isinstance(o, SexpVector) and len(classes) > 0:
        if 'xts' in classes:
            res = rconv.convert_xts_to_df(o)
        elif 'POSIXct' in classes:
            res = rconv.convert_posixct_to_index(o)
        elif 'logical' in classes:
            res = rcommon._convert_vector(o)

    if res is None and isinstance(o, DataFrame):
        res = rcommon.convert_robj(o)
    if res is None and isinstance(o, ListVector) and not skip_list:
        res = convert_ListVector(o)
    if res is None:
        try:
            res = rcommon.convert_robj(o)  # fallback to pandas
        except:
            pass

    try:
        if len(res) == 1:
            return res[0]
    except:
        pass

    if res is None and isinstance(o, SexpVector):
        res = RObjectWrapper(o)
    if res is None:
        res = o
    return res
def runHW(param, meta):
    try:
        meta = pd.read_pickle(param['dspath'] + param['dsname'] + '.hw.df')
    except:
        geno = meta[param['biallele']]
        ro.r('library(HardyWeinberg)')
        ro.globalenv['geno'] = com.convert_to_r_dataframe(geno)

        # Exact Hardy-Weinberg test p-value per site
        index, pval = zip(*map(
            lambda (k, v): (int(k), v['pval']),
            com.convert_robj(ro.r(
                'apply(geno,1,function(x) HWExact(as.numeric(x)))')).items()))
        pval = pd.DataFrame(map(lambda x: x[0], pval), index, columns=['pval'])
        meta = pd.merge(meta, pval, left_index=True, right_index=True, how='left')

        # Chi-square test inbreeding coefficient (f) per site
        index, f = zip(*map(
            lambda (k, v): (int(k), v['f']),
            com.convert_robj(ro.r(
                'apply(geno,1,function(x) HWChisq(as.numeric(x)))')).items()))
        f = pd.DataFrame(map(lambda x: x[0], f), index, columns=['f'])
        meta = pd.merge(meta, f, left_index=True, right_index=True, how='left')

        meta.to_pickle(param['dspath'] + param['dsname'] + '.hw.df')
    return meta
def transform(self, method="vst", inplace=True):
    """
    Perform transformation on counts table.
    Current methods are:
    - deseq2 variance stabilising transformation
    - deseq2 rlog transformation
    """
    assert method in ["vst", "rlog"], "method must be one of [vst, rlog]"

    method2function = {"vst": "varianceStabilizingTransformation",
                       "rlog": "rlog"}
    t_function = method2function[method]

    transform = R(
        """
        function(df){
          suppressMessages(library('DESeq2'))
          design = data.frame(row.names = colnames(df),
                              condition = seq(1, length(colnames(df))))
          dds <- suppressMessages(DESeqDataSetFromMatrix(
                   countData = df, colData = design, design = ~condition))
          transformed <- suppressMessages(%(t_function)s(dds))
          transformed_df <- as.data.frame(assay(transformed))
          return(transformed_df)
        }""" % locals())

    r_counts = com.convert_to_r_dataframe(self.table)
    r_df = com.convert_robj(transform(r_counts))

    # losing rownames for some reason during the conversion?!
    r_df.index = self.table.index

    if inplace:
        self.table = com.convert_robj(r_df)
        # R replaces "-" in column names with ".". Revert back!
        self.table.columns = [x.replace(".", "-") for x in self.table.columns]
    else:
        tmp_counts = self.clone()
        tmp_counts.table = com.convert_robj(r_df)
        tmp_counts.table.columns = [x.replace(".", "-")
                                    for x in tmp_counts.table.columns]
        return tmp_counts
def test_convert_matrix(self):
    mat = self._test_matrix()
    converted = com.convert_robj(mat)
    assert np.array_equal(converted.index, ['a', 'b', 'c'])
    assert np.array_equal(converted.columns, ['one', 'two', 'three'])
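# The `_test_matrix` helper is not shown here. A minimal sketch of what it
# plausibly returns (an R matrix carrying the dimnames the assertions above
# expect) -- an assumption, not necessarily the original helper:
def _test_matrix(self):
    r('mat <- matrix(rnorm(9), ncol=3)')
    r('colnames(mat) <- c("one", "two", "three")')
    r('rownames(mat) <- c("a", "b", "c")')
    return r['mat']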
def runComBat(infiles, outfile):
    # Split infiles
    vstFile, annotationFile = infiles

    # Read expression dataframe
    vstDataframe = pd.read_table(vstFile, index_col='gene_symbol').drop(
        ['B8N', 'B10C'], axis=1)

    # Read annotation dataframe
    annotationDataframe = pd.read_table(annotationFile, index_col='sample_name')

    # Get common samples
    annotationDataframe = annotationDataframe.loc[vstDataframe.columns]

    # Run function
    combatMatrix = r.runComBat(com.convert_to_r_dataframe(vstDataframe),
                               com.convert_to_r_dataframe(annotationDataframe),
                               covariateFormula='~treatment',
                               batchColumn='patient')

    # Convert to dataframe
    combatDataframe = com.convert_robj(combatMatrix)

    # Write file
    combatDataframe.to_csv(outfile, sep='\t', index_label='gene_symbol')
def test_convert_nested_list(self):
    obj = r('list(a=list(foo=1, bar=2))')
    converted = com.convert_robj(obj)
    expected = {'a': {'foo': [1], 'bar': [2]}}
    tm.assert_dict_equal(converted, expected)
def test_convert_list(self):
    obj = r('list(a=1, b=2, c=3)')
    converted = com.convert_robj(obj)
    expected = {'a': [1], 'b': [2], 'c': [3]}
    tm.assert_dict_equal(converted, expected)
def RsoftImpute(self, X):
    softImpute = importr("softImpute")
    X = com.convert_to_r_dataframe(X)
    X = softImpute.complete(
        X, softImpute.softImpute(softImpute.biScale(X, maxit=100)))
    X = com.convert_robj(X)
    return X
def runDESeq(infile, outfiles, outfileRoot):
    # Report
    print 'Doing ' + infile + '...'

    # Read dataframe
    countDataframe = pd.read_table(infile, index_col='gene_symbol')

    # Sample counts per sample type
    sampleCounts = collections.Counter(
        [x.split('-')[-1] for x in countDataframe.columns])

    # Make annotation dataframe
    annotationDataframe = pd.DataFrame.from_dict([{
        'sample_id': x,
        'sample_type': x.split('-')[-1]
    } for x in countDataframe.columns]).set_index('sample_id')

    # Get comparisons (only sample types with at least 5 samples)
    comparisons = [
        list(x[::-1]) for x in itertools.combinations(
            [key for key, value in sampleCounts.iteritems() if value >= 5], 2)
    ]

    # Loop through comparisons
    for comparison in comparisons:

        # Filter
        annotationDataframeSubset = annotationDataframe[
            annotationDataframe['sample_type'].isin(comparison)]
        countDataframeSubset = countDataframe[annotationDataframeSubset.index]

        # Run function
        deseqDataframe = r.runDESeq2(
            com.convert_to_r_dataframe(countDataframeSubset),
            com.convert_to_r_dataframe(annotationDataframeSubset),
            '~ sample_type')

        # Convert to dataframe
        deseqDataframe = com.convert_robj(deseqDataframe)

        # Get comparison string
        comparisonString = 'v'.join(comparison)

        # Get outfile
        outfile = '{outfileRoot}{comparisonString}.txt'.format(**locals())

        # Create outdir
        outDir = os.path.dirname(outfile)
        if not os.path.exists(outDir):
            os.makedirs(outDir)

        # Write
        deseqDataframe.to_csv(outfile, sep='\t', index_label='gene_symbol')
def get_stats(s):
    b = base.summary(s)
    hazard = convert_robj(b.rx2('conf.int')).ix['feature']
    stat = pd.Series(b.rx2('logtest'), index=['stat', 'df', 'p'])
    concordance = pd.Series(b.rx2('concordance'), index=['stat', 'se'])
    ret = pd.concat([hazard, stat, concordance],
                    keys=['hazard', 'LR', 'concordance'])
    return ret
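# Hypothetical usage sketch for get_stats (not from the original source).
# Assumptions: `base` is importr('base') and `convert_robj` comes from
# pandas.rpy.common, as the function above implies; the toy survival data is
# made up, and the single covariate is named 'feature' because get_stats looks
# up that row in the confidence-interval table.
from rpy2 import robjects
from rpy2.robjects.packages import importr
import pandas as pd
import pandas.rpy.common as com

survival = importr('survival')
toy = pd.DataFrame({'days': [5, 10, 15, 20, 25, 30],
                    'event': [1, 0, 1, 1, 0, 1],
                    'feature': [0.1, 1.2, 0.7, 2.3, 0.4, 1.9]})
fit = survival.coxph(robjects.Formula('Surv(days, event) ~ feature'),
                     data=com.convert_to_r_dataframe(toy))
print(get_stats(fit))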
def predict(self, xtest):
    """Predicts class via majority vote.

    Parameters
    ----------
    xtest : pd.DataFrame
        features for test set
    """
    if new_pandas_flag:
        r_xtest = pandas2ri.py2ri(xtest)
    else:
        r_xtest = com.convert_to_r_dataframe(xtest)
    #r_xtest = pandas2ri.py2ri(xtest)

    pred = self.rf_pred(self.rf, r_xtest)

    if new_pandas_flag:
        #py_pred = pandas2ri.ri2py(pred)
        tmp_genes = pred[1]
        tmp_pred_class = pred[0]
        genes = pandas2ri.ri2py(tmp_genes)
        pred_class = pandas2ri.ri2py(tmp_pred_class)
    else:
        py_pred = com.convert_robj(pred)
        genes, pred_class = zip(*py_pred.items())
    #genes = com.convert_robj(tmp_genes)
    #pred_class = com.convert_robj(tmp_pred_class)

    tmp_df = pd.DataFrame({'pred_class': pred_class}, index=genes)
    tmp_df = tmp_df.reindex(xtest.index)
    tmp_df -= 1  # for some reason the class numbers start at 1
    return tmp_df['pred_class']
def ccaPermuteOutcomesVsControls(self, nPerms=25, penaltyXs=None, penaltyZs=None):
    (groups, allOutcomes, controls) = self.outcomeGetter.getGroupsAndOutcomes()
    # groups: set(group_ids)
    # allOutcomes: {outcome: {group_id: value}}
    # controls: {control: {group_id: value}}

    # X contains feature group_norms, Z contains outcome values
    Zdict = allOutcomes
    Xdict = controls

    # R doesn't handle '$'s in column names
    Xdict = {k.replace('$', '.'): v for k, v in Xdict.items()}
    Zdict = {k.replace('$', '.'): v for k, v in Zdict.items()}

    # X, Z, Xfreqs, Zfreqs = self.prepMatrices(pd.DataFrame(data=Xdict),
    #                                          pd.DataFrame(data=Zdict),
    #                                          softImputeXtoo=True)
    X, Z, Xfreqs, Zfreqs = self.prepMatricesTogether(pd.DataFrame(data=Xdict),
                                                     pd.DataFrame(data=Zdict))

    try:
        X = com.convert_to_r_dataframe(X)
        Z = com.convert_to_r_dataframe(Z)
        Ngroups = com.convert_robj(ro.r["nrow"](X)[0])
    except NameError:
        warn("pandas.rpy.common cannot be imported")
        sys.exit(1)

    kwParams = {"nperms": nPerms}
    kwParams['penaltyxs'] = (penaltyXs if penaltyXs
                             else ro.vectors.FloatVector(np.arange(.1, .91, .05)))
    kwParams['penaltyzs'] = (penaltyZs if penaltyZs
                             else ro.vectors.FloatVector(np.arange(.1, .91, .05)))

    self._ccaPermute(X, Z, **kwParams)
def forecast(self, resampled_df, data_freq=52, number_of_predctions=5):
    # start and end date of the series
    start_date = pd.to_datetime(resampled_df.ix[0].name).date()
    end_date = pd.to_datetime(resampled_df.ix[-1].name).date()

    r_series = self.convert_to_r_series(resampled_df, start_date, data_freq)

    # fit the model
    log_r_series = self.base.log(r_series)  # (log series is computed but not used below)
    holt_winter_fit = self.stats.HoltWinters(r_series)

    # forecast
    holt_winter_forecast = self.forecast_lib.forecast_HoltWinters(
        holt_winter_fit, h=number_of_predctions)

    # prepare and convert results to pandas dataframe
    reshaped_melted_results = self.reshape.melt(holt_winter_forecast)

    if data_freq == 52:
        forecast_duration = self.base.as_Date(end_date.strftime('%Y-%m-%d')).ro + \
            (self.base.seq(1, number_of_predctions).ro * 7)
        myxts = self.xts.xts(reshaped_melted_results, forecast_duration)
        results_field = 'value.value.Point.Forecast'
    elif data_freq == 12:
        myxts = holt_winter_forecast
        results_field = 'value.Point.Forecast'

    results_pd_df = com.convert_robj(self.r.melt(myxts))
    results_pd_ts = results_pd_df[results_field]

    return (results_pd_ts, holt_winter_forecast)
def _cca(self, X, Z, **params):
    """Given two Pandas dataframes and a set of parameters, performs CCA

    returns CCA dict (converted from R CCA named list object)
    """
    pma = importr("PMA")
    # Defaults:
    kwParams = {"typex": "standard",
                "typez": "standard",
                "trace": False,
                "K": self.numComponents,
                }
    kwParams.update(params)

    if isinstance(X, pd.core.frame.DataFrame):
        X = com.convert_to_r_dataframe(X)
    if isinstance(Z, pd.core.frame.DataFrame):
        Z = com.convert_to_r_dataframe(Z)

    assert isinstance(X, ro.vectors.DataFrame) and isinstance(Z, ro.vectors.DataFrame), \
        "X, Z need to be either Pandas DataFrames or R dataframes!"
    assert self.numComponents <= min(len(X.names), len(Z.names)), \
        "Number of components must be smaller than the minimum of columns in each of your matrices"

    nGroups = com.convert_robj(ro.r["nrow"](X)[0])
    print("\tCCA parameters:", kwParams)

    cca = pma.CCA(X, Z, **kwParams)
    cca = {k: v for k, v in list(cca.items())}
    cca['nGroups'] = nGroups
    return cca
def gt_basic(es, gene_sets, pheno_class_column, model="logistic", permutations=100):
    """
    @param es: Expression set with defined user class in pheno
    @type es: ExpressionSet

    @type gene_sets: environment.structures.GeneSets

    @param pheno_class_column: Column name of target classes in phenotype table
    @type pheno_class_column: string or None
    """
    GlobalTest.gt_init()

    dataset = com.convert_to_r_matrix(es.get_assay_data_frame())
    response = es.get_pheno_column_as_r_obj(pheno_class_column)

    genes_in_es = es.get_assay_data_frame().index.tolist()
    gs_filtered = filter_gs_by_genes(gene_sets.get_gs(), genes_in_es)

    gt_instance = GlobalTest.gt(
        response,
        R.r['t'](dataset),
        subsets=gs_filtered.to_r_obj(),
        model=model,
        permutations=permutations,
    )

    result = gt_instance.do_slot('result')
    result_df = com.convert_robj(result)
    return result_df
def ccaPermuteOutcomesVsControls(self, groupFreqThresh=0, nPerms=25,
                                 penaltyXs=None, penaltyZs=None):
    (groups, allOutcomes, controls) = \
        self.outcomeGetter.getGroupsAndOutcomes(groupFreqThresh)
    # groups: set(group_ids)
    # allOutcomes: {outcome: {group_id: value}}
    # controls: {control: {group_id: value}}

    # X contains feature group_norms, Z contains outcome values
    Zdict = allOutcomes
    Xdict = controls

    # R doesn't handle '$'s in column names
    Xdict = {k.replace('$', '.'): v for k, v in Xdict.iteritems()}
    Zdict = {k.replace('$', '.'): v for k, v in Zdict.iteritems()}

    # X, Z, Xfreqs, Zfreqs = self.prepMatrices(pd.DataFrame(data=Xdict),
    #                                          pd.DataFrame(data=Zdict),
    #                                          softImputeXtoo=True)
    X, Z, Xfreqs, Zfreqs = self.prepMatricesTogether(pd.DataFrame(data=Xdict),
                                                     pd.DataFrame(data=Zdict))

    X = com.convert_to_r_dataframe(X)
    Z = com.convert_to_r_dataframe(Z)
    Ngroups = com.convert_robj(ro.r["nrow"](X)[0])

    kwParams = {"nperms": nPerms}
    kwParams['penaltyxs'] = (penaltyXs if penaltyXs
                             else ro.vectors.FloatVector(np.arange(.1, .91, .05)))
    kwParams['penaltyzs'] = (penaltyZs if penaltyZs
                             else ro.vectors.FloatVector(np.arange(.1, .91, .05)))

    self._ccaPermute(X, Z, **kwParams)
def fit(self, xtrain, ytrain):
    """The fit method trains R's random forest classifier.

    NOTE: the method name ("fit") and method signature were chosen to
    be consistent with scikit-learn's fit method.

    Parameters
    ----------
    xtrain : pd.DataFrame
        features for training set
    ytrain : pd.DataFrame
        true class labels (as integers) for training set
    """
    label_counts = ytrain.value_counts()
    if self.is_onco_pred and self.is_tsg_pred:
        sampsize = [label_counts[self.other_num],
                    label_counts[self.onco_num],
                    label_counts[self.tsg_num]]
    elif self.is_onco_pred:
        sampsize = [label_counts[self.other_num],
                    label_counts[self.onco_num]]
    elif self.is_tsg_pred:
        sampsize = [label_counts[self.other_num],
                    label_counts[self.tsg_num]]

    self.set_sample_size(sampsize)
    ytrain.index = xtrain.index  # ensure indexes match
    xtrain['true_class'] = ytrain

    r_xtrain = com.convert_to_r_dataframe(xtrain)
    #r_xtrain = pandas2ri.py2ri(xtrain)
    self.rf = self.rf_fit(r_xtrain, self.ntrees, self.sample_size)
    r_imp = self.rf_imp(self.rf)  # importance dataframe in R
    self.feature_importances_ = com.convert_robj(r_imp)
def _cca(self, X, Z, **params):
    """Given two Pandas dataframes and a set of parameters, performs CCA

    returns CCA dict (converted from R CCA named list object)
    """
    pma = importr("PMA")
    # Defaults:
    kwParams = {"typex": "standard",
                "typez": "standard",
                "trace": False,
                "K": self.numComponents,
                }
    kwParams.update(params)

    if isinstance(X, pd.core.frame.DataFrame):
        X = com.convert_to_r_dataframe(X)
    if isinstance(Z, pd.core.frame.DataFrame):
        Z = com.convert_to_r_dataframe(Z)

    assert isinstance(X, ro.vectors.DataFrame) and isinstance(Z, ro.vectors.DataFrame), \
        "X, Z need to be either Pandas DataFrames or R dataframes!"
    assert self.numComponents <= min(len(X.names), len(Z.names)), \
        "Number of components must be smaller than the minimum of columns in each of your matrices"

    nGroups = com.convert_robj(ro.r["nrow"](X)[0])
    print "\tCCA parameters:", kwParams

    cca = pma.CCA(X, Z, **kwParams)
    cca = {k: v for k, v in cca.items()}
    cca['nGroups'] = nGroups
    return cca
def case_classifyCascade(self):
    """An individual case classification function"""
    # ---------- To R for classification ----------
    os.chdir(r"Z:\Cristina\MassNonmass\codeProject\codeBase\extractFeatures\casesDatabase")
    cF = pd.read_csv('casesFrames_toclasify.csv')
    cF['finding.mri_mass_yn'] = cF['finding.mri_mass_yn'].astype('int32')
    cF['finding.mri_nonmass_yn'] = cF['finding.mri_nonmass_yn'].astype('int32')
    cF['finding.mri_foci_yn'] = cF['finding.mri_foci_yn'].astype('int32')
    cF['is_insitu'] = cF['is_insitu'].astype('int32')
    cF['is_invasive'] = cF['is_invasive'].astype('int32')

    self.rpycasesFrame = com.convert_to_r_dataframe(cF)

    base = importr('base')
    base.source('Z:/Cristina/MassNonmass/codeProject/codeBase/finalClassifier/finalClassifier_classifyCascade.R')
    RFcascade = globalenv['finalClassifier_classifyCascade'](self.rpycasesFrame)

    self.RFcascade_probs = com.convert_robj(RFcascade)
    print "\n========================"
    print self.RFcascade_probs

    # process possible outcome
    [veredict, caseoutcome] = self.parse_classes(self.RFcascade_probs)
    print "\n========================\nCascade classification result:"
    print veredict
    print caseoutcome

    return
def __init__(self, data, p=1, type='both'):
    self.rdata = data
    self.p = p
    self.type = type
    self.pydata = rpy.convert_robj(data)
    self._estimate = None
    self.estimate()
def test_convert_frame(self):
    # built-in dataset
    df = r["faithful"]
    converted = com.convert_robj(df)
    assert np.array_equal(converted.columns, ["eruptions", "waiting"])
    assert np.array_equal(converted.index, np.arange(1, 273))
def runCharacteristicDirection(infiles, outfile):
    # Split infiles
    vstFile, annotationFile = infiles

    # Read expression data
    vstDataframe = pd.read_table(vstFile, index_col='gene_symbol')

    # Read annotation data
    annotationDataframe = pd.read_table(annotationFile, index_col='sample_name')

    # Get timepoint samples
    timepointSampleDict = {
        'day' + str(day): annotationDataframe.index[annotationDataframe['day'] == day].tolist()
        for day in set(annotationDataframe['day'])
    }

    # Group 4 and 5 days
    timepointSampleDict['day4-5'] = timepointSampleDict['day4'] + timepointSampleDict['day5']
    del timepointSampleDict['day4']
    del timepointSampleDict['day5']

    # Get controls
    controlColumns = timepointSampleDict.pop('day0')

    # Initialize empty dataframe
    resultDataframe = pd.DataFrame()

    # Loop through timepoints
    for timepoint in timepointSampleDict.keys():

        # Get experiment samples
        experimentColumns = timepointSampleDict[timepoint]

        # Run characteristic direction
        cdResults = r.runCharacteristicDirection(
            com.convert_to_r_dataframe(vstDataframe),
            experimentColumns, controlColumns, 0.1)

        # Convert to dataframe
        cdDataframe = com.convert_robj(cdResults).reset_index()

        # Add timepoint column
        cdDataframe['timepoint'] = timepoint

        # Append
        resultDataframe = pd.concat([resultDataframe, cdDataframe])

    # Pivot
    resultDataframeCast = resultDataframe.pivot(index='index',
                                                columns='timepoint',
                                                values='CD')

    # Save
    resultDataframeCast.to_csv(outfile, sep='\t', index_label='gene_symbol')
def imputation_loyer(year):
    erf = create_comparable_erf_data_frame(year)
    erf = erf[['logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded',
               'magtr', 'mcs8', 'deci', 'wprm', 'ident']]
    erf = erf.dropna(how='any')  # TODO: add a check before dropping rows with NAs

    Logt = create_comparable_logement_data_frame(year)
    Logt = Logt.dropna(how='any')

    allvars = ['logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded',
               'magtr', 'mcs8', 'deci']
    classes = ['magtr', 'tu99_recoded']
    matchvars = list(set(allvars) - set(classes))

    for variable in allvars:
        count_NA(variable, Logt)
        count_NA(variable, erf)

    erf['mcs8'] = erf['mcs8'].astype(int)

    # Lets rpy2 convert the dataframes; pandas2ri doesn't exist anymore in rpy2
    rpy2.robjects.pandas2ri.activate()
    # com.convert_to_r_dataframe()  TODO: probably to be removed

    try:
        # Launch R; you need to have StatMatch installed in R
        sm = importr("StatMatch")
    except:
        sm = importr("StatMatch", lib_loc=STATMATCH_LIB_LOCATION)

    out_nnd = sm.NND_hotdeck(data_rec=erf,
                             data_don=Logt,
                             match_vars=vectors.StrVector(matchvars),
                             don_class=vectors.StrVector(classes),
                             dist_fun="Gower",
                             )
    fill_erf_nnd = sm.create_fused(data_rec=erf,
                                   data_don=Logt,
                                   mtc_ids=out_nnd[0],
                                   z_vars=vectors.StrVector(["lmlm"]),
                                   )
    del allvars, matchvars, classes, out_nnd
    gc.collect()

    fill_erf_nnd = com.convert_robj(fill_erf_nnd)
    fill_erf_nnd = DataFrame(fill_erf_nnd)
    fill_erf_nnd.rename(columns={'lmlm': 'loym'}, inplace=True)

    loy_imput = fill_erf_nnd[['ident', 'loym']]

    erfmenm = load_temp(name="menagem", year=year)

    for var in ["loym", "loym_x", "loym_y", "loym_z"]:
        if var in erfmenm:
            del erfmenm[var]
            log.info("{} has been deleted".format(var))

    erfmenm = erfmenm.merge(loy_imput, on='ident', how='left')
    assert 'loym' in erfmenm.columns, \
        u"The loym variable is not present in erfmenm"

    save_temp(erfmenm, name="menagem", year=year)
def convert_xts_to_df(o):
    """
    Will convert xts objects to DataFrame
    """
    dates = o.do_slot('index')
    dates = np.array(dates, dtype=np.dtype("M8[s]"))
    res = robjects.default_ri2py(o)
    df = rcom.convert_robj(res)
    df.index = dates
    return df
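# Hypothetical usage sketch for convert_xts_to_df (not from the original
# source). Assumptions: the R 'xts' package is installed and `robjects`,
# `rcom`, and `np` are imported as the function above requires; a POSIXct
# index is used so the stored values are epoch seconds, matching the "M8[s]"
# cast.
import rpy2.robjects as robjects

robjects.r('suppressMessages(library(xts))')
xts_obj = robjects.r('xts(1:5, order.by = as.POSIXct("2015-01-01", tz="UTC")'
                     ' + (0:4) * 86400)')
print(convert_xts_to_df(xts_obj))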
def load_cv(self, path):
    set_wd_str = 'setwd("{0}")'.format(os.getcwd())
    ro.r(set_wd_str)
    ro.r('load("{0}")'.format(path))
    self.rf_cv = ro.r["trained.models"]
    if new_pandas_flag:
        #self.cv_folds = pandas2ri.ri2py(ro.r["cvFoldDf"])
        self.cv_folds = ro.r["cvFoldDf"]
    else:
        self.cv_folds = com.convert_robj(ro.r["cvFoldDf"])
def sav_to_pandas_rpy2(input_file):
    """
    SPSS .sav files to Pandas DataFrame through Rpy2

    :param input_file: string

    :return:
    """
    import pandas.rpy.common as com

    w = com.robj.r('foreign::read.spss("%s", to.data.frame=TRUE)' % input_file)
    return com.convert_robj(w)
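# Hypothetical usage sketch (not from the original source). Assumptions: R and
# its 'foreign' package are available, and 'data/survey.sav' is a placeholder
# path, not a file shipped with the original project.
if __name__ == '__main__':
    df = sav_to_pandas_rpy2('data/survey.sav')
    print(df.head())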
def draw_survival_curves_mpl(fit, ax=None, title=None, colors=None, ms=80,
                             alpha=1):
    """
    Takes an R survfit.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(4, 3))

    s = base.summary(fit)
    tab = pd.DataFrame({v: s.rx2(v) for v in s.names
                        if len(s.rx2(v)) == len(s.rx2('time'))},
                       index=s.rx2('time'))
    call = com.convert_robj(fit.rx2('call')[2])

    groups = robjects.r.sort(robjects.r.c(*call.feature.unique()))

    if 'strata' not in tab:
        groups = [0]
        tab['strata'] = 1
    elif len(tab.strata.unique()) != len(groups):
        gg = list(call[call.event > 0].feature.unique())
        gg = [g for g in groups if g in gg]
        bg = [g for g in groups if g not in gg]
        groups = gg + bg

    for i, group in enumerate(groups):
        censoring = call[(call.event == 0) & (call.feature == group)].days
        surv = tab[tab.strata == (i + 1)].surv
        surv = surv.copy().set_value(0., 1.)
        surv = surv.sort_index()
        if surv.index[-1] < censoring.max():
            surv = surv.set_value(censoring.max(), surv.iget(-1)).sort_index()

        censoring_pos = get_markers(censoring, surv)
        ax.step(surv.index, surv, lw=3, where='post', alpha=alpha, label=group)
        if colors is not None:
            try:
                # fix for R-Python str-to-int conversion
                color = colors[group]
            except:
                color = colors[i]
            ax.lines[-1].set_color(color)
        if len(censoring_pos) > 0:
            ax.scatter(*zip(*censoring_pos), marker='|', s=ms,
                       color=ax.lines[-1].get_color())

    ax.set_ylim(0, 1.05)
    # ax.set_xlim(0, max(surv.index)*1.05)
    ax.set_xlim(0, max(call.days) * 1.05)
    ax.legend(loc='best')
    ax.set_ylabel('Survival')
    ax.set_xlabel('Years')
    if title:
        ax.set_title(title)
def unpack_r_results_list(res_list):
    """Unpacks the results list to a tuple (net_benefit, interventions_avoided)
    for comparison with the results of Python

    Transforms the results of the R analysis into the pandas DataFrame
    format and indexing returned by the Python algorithm

    Parameters
    ----------
    res_list : rpy2.robject
        a list of results from an R analysis (returned by the R dca function)

    Returns
    -------
    tuple(pd.DataFrame, pd.DataFrame)
        (net_benefit, interventions_avoided) -- same result as Python analysis
    """
    r_nb = pdcom.convert_robj(res_list.rx('net.benefit'))
    r_nb = r_nb['net.benefit']  # unpack DataFrame from dict
    r_ia = pdcom.convert_robj(res_list.rx('interventions.avoided'))
    r_ia = r_ia['interventions.avoided']
    return r_nb, r_ia
def runVoom(infile, outfile):
    # Read expression dataframe
    rawcountDataframe = pd.read_table(infile, index_col='gene_symbol')

    # Run function
    voomMatrix = r.runVoom(com.convert_to_r_dataframe(rawcountDataframe))

    # Convert to dataframe
    voomDataframe = com.convert_robj(voomMatrix)

    # Write file
    voomDataframe.to_csv(outfile, sep='\t', index_label='gene_symbol')
def get_surv_fit(surv, feature=None, covariates=None, interactions=None,
                 formula=None, time_cutoff=5):
    df, factors = process_covariates(surv, feature, covariates)
    if formula is None:
        fmla = get_formula(factors, interactions)
        fmla = robjects.Formula(fmla)
    else:
        fmla = robjects.Formula(formula)

    s = survival.survfit(fmla, df)
    summary = base.summary(s, times=robjects.r.c(time_cutoff))

    res = convert_robj(summary.rx2('table'))
    if type(res) == list:
        r = summary.rx2('table')
        r = pd.Series(r, r.names)
        res = pd.DataFrame({'feature=all': r}).T

    res = res.rename(index=lambda idx: idx.split('=')[1])
    res = res[['records', 'events', 'median', '0.95LCL', '0.95UCL']]
    res.columns = pd.MultiIndex.from_tuples([('Stats', '# Patients'),
                                             ('Stats', '# Events'),
                                             ('Median Survival', 'Median'),
                                             ('Median Survival', 'Lower'),
                                             ('Median Survival', 'Upper')])

    if feature is None:
        for f in ['surv', 'lower', 'upper']:
            res[(str(time_cutoff) + 'y Survival', f.capitalize())] = summary.rx2(f)
    else:
        idx = map(lambda s: s.replace('feature=', ''),
                  summary.rx2('strata').iter_labels())
        df = pd.DataFrame({d: list(summary.rx2(d))
                           for d in ['strata', 'surv', 'lower', 'upper']},
                          index=idx)
        for f in ['surv', 'lower', 'upper']:
            res[(str(time_cutoff) + 'y Survival', f.capitalize())] = df[f]

    try:
        res.index = map(int, res.index)
    except:
        pass
    return res
def ccaPermute(self, nPerms=25, penaltyXs=None, penaltyZs=None,
               controlsWithFeats=False):
    (groups, allOutcomes, controls) = self.outcomeGetter.getGroupsAndOutcomes()
    # groups: set(group_ids)
    # allOutcomes: {outcome: {group_id: value}}
    # controls: {control: {group_id: value}}

    (groupNorms, featureNames) = \
        self.featureGetter.getGroupNormsWithZerosFeatsFirst(groups)

    Zdict = allOutcomes
    Xdict = groupNorms
    if controlsWithFeats:
        print("Appending controls to X")
        Xdict.update(controls)
    else:
        print("Appending controls to Z")
        Zdict.update(controls)

    # TO DO: get topic frequencies?
    # groupNorms: {feat: {group_id: group_norm}}
    # featureNames: list of possible feature names

    # X contains feature group_norms, Z contains outcome values
    X, Z, Xfreqs, Zfreqs = self.prepMatrices(pd.DataFrame(data=Xdict),
                                             pd.DataFrame(data=Zdict))

    try:
        X = com.convert_to_r_dataframe(X)
        Z = com.convert_to_r_dataframe(Z)
        Ngroups = com.convert_robj(ro.r["nrow"](X)[0])
    except NameError:
        warn("pandas.rpy.common cannot be imported")
        sys.exit(1)

    kwParams = {"nperms": nPerms}
    kwParams['penaltyxs'] = (penaltyXs if penaltyXs
                             else ro.vectors.FloatVector(np.arange(.1, .91, .05)))
    kwParams['penaltyzs'] = (penaltyZs if penaltyZs
                             else ro.vectors.FloatVector(np.arange(.1, .91, .05)))

    self._ccaPermute(X, Z, **kwParams)
def predict_proba(self, xtest):
    """Predicts the probability for each class.

    Parameters
    ----------
    xtest : pd.DataFrame
        features for test set
    """
    r_xtest = com.convert_to_r_dataframe(xtest)
    #r_xtest = pandas2ri.ri2py(xtest)
    pred_prob = self.rf_pred_prob(self.rf, r_xtest)
    py_pred_prob = com.convert_robj(pred_prob)
    #py_pred_prob = pandas2ri.ri2py(pred_prob)
    return py_pred_prob.values