def __init__(self, formula_str, df, factors=None,
             resid_formula_str=None, **lmer_opts):
    """Build the R data frame and formula objects used for the fit.

    Parameters
    ----------
    formula_str : str
        Model formula. The text to the left of ``~`` names the
        predicted variable.
    df : record array or pandas.DataFrame
        Observations. A pandas DataFrame is converted with
        ``to_records()``. When the predicted variable is not already
        a field, a field of zeros is appended under that name.
    factors : dict, optional
        Maps field names to the ``levels`` passed to ``FactorVector``.
        A value of ``None`` is replaced by ``MissingArg``; any other
        value that is not already a ``Vector`` is wrapped in one.
        Fields whose first value is a ``str`` are added automatically.
        The caller's dict is copied, never modified.
    resid_formula_str : str, optional
        When truthy, a second ``Formula`` is built from it and stored
        on ``self._rformula_resid``; otherwise that attribute is None.
    **lmer_opts
        Stored unchanged on ``self._lmer_opts``.
    """
    # get the pred_var (everything left of the '~')
    pred_var = formula_str.split('~')[0].strip()

    # convert df to a recarray if it's a dataframe
    if isinstance(df, pd.DataFrame):
        df = df.to_records()

    # add column if necessary
    if pred_var not in df.dtype.names:
        df = append_fields(df, pred_var, [0.0] * len(df), usemask=False)

    # work on a private copy so the caller's dict is never mutated
    factors = {} if factors is None else dict(factors)

    # add in MissingArg for any potential factor not provided
    for k in df.dtype.names:
        if isinstance(df[k][0], str) and k not in factors:
            factors[k] = MissingArg

    for f in factors:
        if factors[f] is None:
            factors[f] = MissingArg
        # checking for both types of R Vectors for rpy2 variations;
        # identity (not ==) so array-like levels do not trigger an
        # element-wise comparison with an ambiguous truth value
        elif (not isinstance(factors[f], Vector)
              and factors[f] is not MissingArg):
            factors[f] = Vector(factors[f])

    # convert the recarray to a DataFrame (releveling if desired)
    self._rdf = DataFrame({
        k: (FactorVector(df[k], levels=factors[k])
            if (k in factors) or isinstance(df[k][0], str)
            else df[k])
        for k in df.dtype.names
    })

    # get the column index of the predicted variable
    self._col_ind = list(self._rdf.colnames).index(pred_var)

    # make a formula obj
    self._rformula = Formula(formula_str)

    # make one for resid if necessary
    if resid_formula_str:
        self._rformula_resid = Formula(resid_formula_str)
    else:
        self._rformula_resid = None

    # save the args
    self._lmer_opts = lmer_opts

    # model is null to start
    self._ms = None
def py2ri_pandasdataframe(obj):
    """Convert a pandas DataFrame into an R ``DataFrame``.

    Columns are visited in order. A column whose dtype kind is ``'O'``
    is wrapped in a ``StrVector``; every other column goes through
    ``conversion.py2ri``.
    """
    od = OrderedDict()
    # items() rather than iteritems(): the latter was removed in pandas 2.0
    for name, values in obj.items():
        if values.dtype.kind == 'O':
            od[name] = StrVector(values)
        else:
            od[name] = conversion.py2ri(values)
    return DataFrame(od)
def py2rpy_pandasdataframe(obj):
    """Convert a pandas DataFrame into an R ``DataFrame``.

    Each column is converted with ``conversion.py2rpy``. If that raises,
    a warning naming the column and the error is emitted and the column
    is converted with ``StrVector`` instead.
    """
    od = OrderedDict()
    # items() rather than iteritems(): the latter was removed in pandas 2.0
    for name, values in obj.items():
        try:
            od[name] = conversion.py2rpy(values)
        except Exception as e:
            # deliberate best-effort: report the failure, then fall back
            warnings.warn('Error while trying to convert '
                          'the column "%s". Fall back to string conversion. '
                          'The error is: %s' % (name, str(e)))
            od[name] = StrVector(values)
    return DataFrame(od)
def read_data(self, file_name, col_from, col_to):
    """Read ``../Data/<file_name>.xlsx`` and cache its parts on the instance.

    The numeric block is the column range ``col_from`` to ``col_to``
    (inclusive); the metadata is the first element of column
    ``col_from - 1``. Row names of the raw frame are reset to
    ``1..len(metadata)``. Finishes by calling
    ``self.make_metabolite_dict()``.
    """
    workbook_path = "../Data/" + file_name + '.xlsx'
    raw = DataFrame(excel.read_excel(workbook_path))

    numeric_columns = IntVector(range(col_from, col_to + 1))
    numeric = raw.rx(True, numeric_columns)
    meta = raw.rx(True, col_from - 1)[0]

    # number the rows consecutively, starting at 1
    row_labels = IntVector(range(1, len(meta) + 1))
    raw._set_rownames(row_labels)

    self.file_name = file_name
    self.raw_data = raw
    self.numeric_data = numeric
    self.metadata = r_base.factor(meta)
    self.metadata_list = list(meta)
    # every column name except the first
    self.metabolite_list = list(self.raw_data.names)[1:]
    self.make_metabolite_dict()
def as_dataframe(table):
    '''returns a DataFrame instance. Requires counts to be
    [[col1, col2, col3, ..]]

    ``table.tolist()`` is transposed into columns and paired with
    ``table.header``. A column whose first value is a ``str`` becomes a
    ``StrVector``; every other column becomes an ``IntVector``.
    '''
    columns = zip(*table.tolist())
    data = {}
    for name, values in zip(table.header, columns):
        # only the first value is inspected to choose the vector type
        klass = StrVector if isinstance(values[0], str) else IntVector
        data[name] = klass(values)
    return DataFrame(data)
def pandas2ri(obj):
    """Convert a pandas object into its rpy2 counterpart.

    * ``PandasDataFrame``: an R ``DataFrame``. Columns of dtype kind
      ``'O'`` become ``StrVector``; the others are converted
      recursively.
    * ``PandasIndex``: a ``StrVector`` for dtype kind ``'O'``,
      otherwise the result of ``numpy2ri.numpy2ri``.
    * ``PandasSeries``: a ``POSIXct`` for dtype ``'<M8[ns]'``, otherwise
      the numpy conversion of ``obj.values``; the index is attached as
      ``names`` (or ``dimnames`` when ``obj.ndim`` is not 1).
    * anything else is delegated to ``original_py2ri``.
    """
    if isinstance(obj, PandasDataFrame):
        od = OrderedDict()
        # items() rather than iteritems(): the latter was removed in
        # pandas 2.0
        for name, values in obj.items():
            if values.dtype.kind == 'O':
                od[name] = StrVector(values)
            else:
                od[name] = pandas2ri(values)
        return DataFrame(od)
    elif isinstance(obj, PandasIndex):
        if obj.dtype.kind == 'O':
            return StrVector(obj)
        else:
            # only other alternative to 'O' is integer, I think,
            # which goes straight to the numpy converter.
            return numpy2ri.numpy2ri(obj)
    elif isinstance(obj, PandasSeries):
        if obj.dtype == '<M8[ns]':
            # time series: split each timestamp into its components
            d = [
                IntVector([x.year for x in obj]),
                IntVector([x.month for x in obj]),
                IntVector([x.day for x in obj]),
                IntVector([x.hour for x in obj]),
                IntVector([x.minute for x in obj]),
                IntVector([x.second for x in obj])
            ]
            res = ISOdatetime(*d)
            # FIXME: can the POSIXct be created from the POSIXct constructor ?
            # (is '<M8[ns]' mapping to Python datetime.datetime ?)
            res = POSIXct(res)
        else:
            # converted as a numpy array
            res = numpy2ri.numpy2ri(obj.values)
        # "index" is equivalent to "names" in R
        if obj.ndim == 1:
            res.do_slot_assign('names',
                               ListVector({'x': pandas2ri(obj.index)}))
        else:
            res.do_slot_assign('dimnames',
                               ListVector(pandas2ri(obj.index)))
        return res
    else:
        return original_py2ri(obj)
def aov(matrix, factor_names, measure_name, robj, interactions = '+'):
    '''
    Computes a repeated measures anova in R via the 'aov' command.

    This function uses R's aov function. It does not compute
    Greenhouse-Geisser and Huynh-Feldt corrections. Use lm_anova for this.

    The R workspace is cleared first (``rm(list = ls(all = TRUE))``), the
    data is stored as ``df`` in R's global environment and attached, and
    the summary of the fitted model is printed. ``df`` is detached again
    even when one of the R calls raises.

    Input:
        matrix : ndarray
            Each dimension of the matrix corresponds to one factor. The
            first dimension must be (!) the number of subjects. The values
            in the matrix are taken as the dependent variable.
        factor_names : list
            List with names of each factor. The ordering must correspond to
            the dimensions given by matrix.shape.
        measure_name : str
            Name of the dependent variable.
        robj : rpy2.robjects instance
        interactions : str
            '+' for no interactions
            '*' for all interactions
    '''
    robj.r('rm(list = ls(all = TRUE)) ')
    df = make_data_frame(matrix, factor_names, measure = measure_name)
    robj.globalenv['df'] = DataFrame(df)
    robj.r('attach(df)')
    try:
        # turn every factor column into an R factor of the same name
        for factor in factor_names:
            robj.r('%s<-factor(df$%s)'%(factor,factor))
        # join instead of concatenating and slicing off a leading character,
        # which only worked for one-character separators
        formula = interactions.join(factor_names)
        error = '*'.join(factor_names)
        # NOTE(review): `data=df` sits inside Error(...) rather than being an
        # argument of aov(); left unchanged because the variables are also
        # reachable through attach(df) -- confirm this is intended.
        command = 'aov.out <- aov(%s ~ %s + Error(subject/(%s), data=df))'%(measure_name, formula, error)
        robj.r(command)
        print(robj.r('summary(aov.out)'))
    finally:
        # always undo the attach so a failed fit does not leave df attached
        robj.r('detach(df)')
def __init__(
        self,
        fe_formula,
        re_formula,
        re_group,
        dep_data,
        ind_data,
        factors=None,
        row_mask=None,
        use_ranks=False,
        use_norm=True,
        memmap=False,
        memmap_dir=None,
        resid_formula=None,
        null_formula=None,
        num_null_boot=0,
        svd_terms=None,
        use_ssvd=False,
        n_jobs=1,
        verbose=10,
        lmer_opts=None):
    """Prepare per-group matrices and evaluate the model on the real data.

    Parameters
    ----------
    fe_formula, re_formula : str
        Joined with ``' + '`` into ``self._formula_str``. ``fe_formula``
        alone is used to build the model matrix.
    re_group : str
        Field of ``ind_data`` holding the grouping variable (used when
        ``ind_data`` is not a dict).
    dep_data : array or dict
        Dependent data, indexed either by group key (dict) or by the
        same row selection as ``ind_data``. All axes after the first
        are flattened into features.
    ind_data : record array or dict
        Independent data; a dict is keyed by group.
    factors : container of str, optional
        Field names wrapped in ``FactorVector`` for the model matrix and
        excluded from ranking. ``None`` is treated as "no factors".
    row_mask : None, dict or array, optional
        Row selection per group: all rows, ``row_mask[g]``, or
        ``row_mask`` indexed like ``ind_data``.
    use_ranks : bool
        Replace non-factor, non-string fields, the columns of D and the
        columns of A with their ranks.
    use_norm : bool
        Mean-centre the columns of M and scale them to unit norm.
        A is always normalised, regardless of this flag.
    memmap, memmap_dir
        When ``memmap`` is truthy, M and D are passed through
        ``_memmap_array`` with ``memmap_dir``.
    resid_formula, null_formula : str, optional
        When truthy, stored with ``re_formula`` appended; otherwise the
        corresponding attribute is None.
    svd_terms : list of str, optional
        Model-matrix columns used to build A. Defaults to every column
        whose name does not contain ``'Intercept'``.
    num_null_boot, use_ssvd, n_jobs, verbose
        Stored on the instance. ``verbose > 0`` also enables progress
        output on stdout.
    lmer_opts : dict, optional
        Stored on the instance; ``None`` becomes ``{}``.

    The instance registers itself in ``_global_meld`` under ``id(self)``
    and then calls ``_eval_model(id(self), None, None)``.
    """
    if verbose > 0:
        sys.stdout.write('Initializing...')
        sys.stdout.flush()
        start_time = time.time()

    # save the formula
    self._formula_str = fe_formula + ' + ' + re_formula

    # see if there's a resid formula (the random effects are the same)
    if resid_formula:
        self._resid_formula_str = resid_formula + ' + ' + re_formula
    else:
        self._resid_formula_str = None

    # see if there's a null formula (the random effects are the same)
    if null_formula:
        self._null_formula_str = null_formula + ' + ' + re_formula
    else:
        self._null_formula_str = None
    self._num_null_boot = num_null_boot

    # save whether using ranks
    self._use_ranks = use_ranks

    # see whether to use sparse svd
    self._use_ssvd = use_ssvd

    # see if memmapping
    self._memmap = memmap

    # save job info
    self._n_jobs = n_jobs
    self._verbose = verbose

    # eventually fill the feature shape
    self._feat_shape = None

    # membership tests below must also work with the default factors=None
    factor_names = () if factors is None else factors

    # fill A,M,O,D
    self._A = {}
    self._M = {}
    self._O = {}
    self._D = {}
    O = []

    # loop over unique grouping var
    self._re_group = re_group
    if isinstance(ind_data, dict):
        # groups are the keys
        self._groups = np.array(list(ind_data.keys()))
    else:
        # groups need to be extracted from the recarray
        self._groups = np.unique(ind_data[re_group])
    for g in self._groups:
        # get that subj inds
        if isinstance(ind_data, dict):
            # the index is just the group into that dict
            ind_ind = g
        else:
            # select the rows based on the group
            ind_ind = ind_data[re_group] == g

        # process the row mask
        if row_mask is None:
            # no mask, so all good (builtin bool: np.bool no longer exists)
            row_ind = np.ones(len(ind_data[ind_ind]), dtype=bool)
        elif isinstance(row_mask, dict):
            # pull the row_mask from the dict
            row_ind = row_mask[g]
        else:
            # index into it with ind_ind
            row_ind = row_mask[ind_ind]

        # extract that group's A,M,O
        # first save the observations (rows of A)
        self._O[g] = ind_data[ind_ind][row_ind]
        if use_ranks:
            # loop over non-factors and rank them
            for n in self._O[g].dtype.names:
                if (n in factor_names) or isinstance(self._O[g][n][0], str):
                    continue
                self._O[g][n] = rankdata(self._O[g][n])
        O.append(self._O[g])

        # eventually allow for dict of data files for dep_data
        if isinstance(dep_data, dict):
            # the index is just the group into that dict
            dep_ind = g
        else:
            # select the rows based on the group
            dep_ind = ind_ind

        # save feature shape if necessary
        if self._feat_shape is None:
            self._feat_shape = dep_data[dep_ind].shape[1:]

        # Save D index into data, flattened to (rows, features)
        self._D[g] = dep_data[dep_ind][row_ind]
        self._D[g] = self._D[g].reshape((self._D[g].shape[0], -1))
        if use_ranks:
            if verbose > 0:
                sys.stdout.write('Ranking %s...' % (str(g)))
                sys.stdout.flush()
            for i in range(self._D[g].shape[1]):
                self._D[g][:, i] = rankdata(self._D[g][:, i])

        # keep a separate copy as M so D stays un-normalised
        self._M[g] = self._D[g].copy()

        # normalize M
        if use_norm:
            self._M[g] -= self._M[g].mean(0)
            self._M[g] /= np.sqrt((self._M[g]**2).sum(0))

        # determine A from the model.matrix
        rdf = DataFrame({
            k: (FactorVector(self._O[g][k])
                if k in factor_names else self._O[g][k])
            for k in self._O[g].dtype.names
        })

        # model spec as data frame
        ms = r['data.frame'](r_model_matrix(Formula(fe_formula), data=rdf))

        cols = list(r['names'](ms))
        if svd_terms is None:
            self._svd_terms = [c for c in cols if 'Intercept' not in c]
        else:
            self._svd_terms = svd_terms
        self._A[g] = np.concatenate(
            [np.array(ms.rx(c)) for c in self._svd_terms]).T

        if use_ranks:
            for i in range(self._A[g].shape[1]):
                self._A[g][:, i] = rankdata(self._A[g][:, i])

        # normalize A (always, independent of use_norm)
        self._A[g] -= self._A[g].mean(0)
        self._A[g] /= np.sqrt((self._A[g]**2).sum(0))

        # memmap if desired
        if self._memmap:
            self._M[g] = _memmap_array(self._M[g], memmap_dir)
            self._D[g] = _memmap_array(self._D[g], memmap_dir)

    # replace the per-group dict with the list in group order
    self._O = O
    if lmer_opts is None:
        lmer_opts = {}
    self._lmer_opts = lmer_opts
    self._factors = factors

    # prepare for the perms and boots
    self._perms = []
    self._boots = []
    self._tp = []
    self._tb = []

    if verbose > 0:
        sys.stdout.write('Done (%.2g sec)\n' % (time.time() - start_time))
        sys.stdout.write('Processing actual data...')
        sys.stdout.flush()
        start_time = time.time()

    # register the instance so _eval_model can look it up by id
    global _global_meld
    _global_meld[id(self)] = self

    # run for actual data (returns both perm and boot vals)
    self._R = None
    self._ss = None
    self._mer = None
    self._mer_null = None
    tp, tb, R, feat_mask, ss, mer, mer_null = _eval_model(
        id(self), None, None)
    self._R = R
    self._tp.append(tp)
    self._tb.append(tb)
    self._feat_mask = feat_mask
    self._ss = ss
    self._mer = mer
    self._mer_null = mer_null

    if verbose > 0:
        sys.stdout.write('Done (%.2g sec)\n' % (time.time() - start_time))
        sys.stdout.flush()
def lmer_feature(formula_str, dat, perms=None, val=None, factors=None,
                 **kwargs):
    """
    Run LMER on a number of permutations of the predicted data.

    Parameters
    ----------
    formula_str : str
        Model formula; the text left of ``~`` names the permuted field.
    dat : record array
        Data for the fit. When ``val`` is given it is written into the
        permuted field of ``dat`` in place.
    perms : sequence, optional
        Index arrays, one per permutation. ``None`` fits the data once,
        unpermuted. A ``None`` entry leaves the column untouched.
    val : array-like, optional
        Replacement values for the permuted field.
    factors : container of str, optional
        Fields to wrap in ``FactorVector`` (fields whose first value is
        a ``str`` are wrapped regardless).
    **kwargs
        Forwarded to ``lme4.lmer``.

    Returns
    -------
    numpy record array or None
        One row per permutation and one field per coefficient row name,
        holding the ``t.value`` column. Rows of fits that raised stay
        NaN. ``None`` when no fit succeeded.
    """
    # get the perm_var
    perm_var = formula_str.split('~')[0].strip()

    # set the val if necessary
    if val is not None:
        dat[perm_var] = val

    # make factor list if necessary
    if factors is None:
        factors = []

    # convert the recarray to a DataFrame
    rdf = DataFrame({
        k: (FactorVector(dat[k])
            if (k in factors) or isinstance(dat[k][0], str)
            else dat[k])
        for k in dat.dtype.names
    })

    # get the column index
    col_ind = list(rdf.colnames).index(perm_var)

    # make a formula obj
    rformula = Formula(formula_str)

    # just apply to actual data if no perms
    if perms is None:
        perms = [None]

    # run on each permutation
    tvals = None
    for i, perm in enumerate(perms):
        if perm is not None:
            # set the perm (+1 because rx indexes from 1)
            # NOTE(review): this reorders the column already reordered by
            # earlier iterations, so permutations compose -- confirm intended
            rdf[col_ind] = rdf[col_ind].rx(perm + 1)

        # inside try block to catch convergence errors; Exception rather
        # than a bare except so Ctrl-C and SystemExit still propagate
        try:
            ms = lme4.lmer(rformula, data=rdf, **kwargs)
        except Exception:
            continue

        # extract the result
        df = r['data.frame'](r_coef(r['summary'](ms)))
        if tvals is None:
            # init the data on the first successful fit, all NaN
            rows = list(r['row.names'](df))
            tvals = np.rec.fromarrays(
                [np.ones(len(perms)) * np.nan for _ in rows],
                names=','.join(rows))
        tvals[i] = tuple(df.rx2('t.value'))

    return tvals
def rpy2py_listvector(obj):
    """Convert an R list vector to Python.

    Objects whose ``rclass`` contains ``'data.frame'`` are wrapped in
    ``DataFrame`` and passed to ``rpy2py``; all others are handed to
    ``numpy2ri.rpy2py``.
    """
    if 'data.frame' not in obj.rclass:
        return numpy2ri.rpy2py(obj)
    return rpy2py(DataFrame(obj))
p.join() times_r.append(res) from rpy2.robjects.vectors import DataFrame, FloatVector, StrVector, IntVector d = {} d['code'] = StrVector([x[0] for x in combos]) + StrVector([x[0] for x in combos_r]) d['sequence'] = StrVector([x[-2] for x in combos]) + StrVector( [x[0] for x in combos_r]) d['time'] = FloatVector([x for x in times]) + FloatVector( [x[0] for x in combos_r]) d['n_loop'] = IntVector([x[-1] for x in combos]) + IntVector( [x[1] for x in combos_r]) d['group'] = StrVector( [d['code'][x] + ':' + d['sequence'][x] for x in xrange(len(d['n_loop']))]) dataf = DataFrame(d) from rpy2.robjects.lib import ggplot2 p = ggplot2.ggplot(dataf) + \ ggplot2.geom_line(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \ ggplot2.geom_point(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \ ggplot2.facet_wrap(Formula('~sequence')) + \ ggplot2.scale_y_continuous('running time') + \ ggplot2.scale_x_continuous('repeated n times', ) + \ ggplot2.xlim(0, max(n_loops)) + \ ggplot2.opts(title = "Benchmark (running time)")
def __init__(self,
             fe_formula,
             re_formula,
             re_group,
             dep_data,
             ind_data,
             factors=None,
             row_mask=None,
             dep_mask=None,
             use_ranks=False,
             use_norm=True,
             memmap=False,
             memmap_dir=None,
             resid_formula=None,
             svd_terms=None,
             feat_thresh=0.05,
             feat_nboot=1000,
             do_tfce=False,
             connectivity=None,
             shape=None,
             dt=.01,
             E=2 / 3.,
             H=2.0,
             n_jobs=1,
             verbose=10,
             lmer_opts=None):
    """Prepare per-group matrices and evaluate the model on the real data.

    dep_data can be an array or a dict of arrays (possibly
    memmapped), one for each group.

    ind_data can be a rec_array for each group or one large rec_array
    with a grouping variable.

    Other parameters
    ----------------
    fe_formula, re_formula : str
        Joined with ``' + '`` into ``self._formula_str``. ``fe_formula``
        alone is used to build the model matrix.
    re_group : str
        Field of ``ind_data`` holding the grouping variable (used when
        ``ind_data`` is not a dict).
    factors : container of str, optional
        Field names wrapped in ``FactorVector`` for the model matrix and
        excluded from ranking. ``None`` is treated as "no factors".
    row_mask : None, dict or array, optional
        Row selection per group: all rows, ``row_mask[g]``, or
        ``row_mask`` indexed like ``ind_data``.
    dep_mask : boolean array, optional
        Feature mask applied to ``dep_data``. Defaults to all-True with
        the feature shape of the first group.
    use_ranks : bool
        Replace non-factor, non-string fields by their ranks, and the
        columns of D and A by ranks rescaled with ``(rank - 1) / (n - 1)``.
    use_norm : bool
        Mean-centre the columns of M and scale them to unit norm.
        A is always normalised, regardless of this flag.
    memmap, memmap_dir
        When ``memmap`` is truthy, M and D are passed through
        ``_memmap_array`` with ``memmap_dir`` and ``unique_id=str(g)``.
    resid_formula : str, optional
        When truthy, stored with ``re_formula`` appended.
    svd_terms : list of str, optional
        Model-matrix columns used to build A. Defaults to every column
        whose name does not contain ``'Intercept'``.
    do_tfce, connectivity
        When ``do_tfce`` is set, no connectivity is supplied and the mask
        excludes at least one feature, a connectivity is built from
        ``cluster.simple_neighbors_1d`` per feature axis and then
        restricted to the masked features.
    shape
        Accepted but not used in this constructor.
    feat_thresh, feat_nboot, dt, E, H, n_jobs, verbose
        Stored on the instance. ``verbose > 0`` also enables progress
        output on stdout.
    lmer_opts : dict, optional
        Stored on the instance; ``None`` becomes ``{}``.

    The instance registers itself in ``_global_meld`` under ``id(self)``
    and then calls ``_eval_model(id(self), None)``.
    """
    if verbose > 0:
        sys.stdout.write('Initializing...')
        sys.stdout.flush()
        start_time = time.time()

    # save the formula
    self._formula_str = fe_formula + ' + ' + re_formula

    # see if there's a resid formula (the random effects are the same)
    if resid_formula:
        self._resid_formula_str = resid_formula + ' + ' + re_formula
    else:
        self._resid_formula_str = None

    # save whether using ranks
    self._use_ranks = use_ranks

    # see the thresh for keeping a feature
    self._feat_thresh = feat_thresh
    self._feat_nboot = feat_nboot
    self._do_tfce = do_tfce
    self._connectivity = connectivity
    self._dt = dt
    self._E = E
    self._H = H

    # see if memmapping
    self._memmap = memmap

    # save job info
    self._n_jobs = n_jobs
    self._verbose = verbose

    # eventually fill the feature shape
    self._feat_shape = None

    # handle the dep_mask
    self._dep_mask = dep_mask

    # membership tests below must also work with the default factors=None
    factor_names = () if factors is None else factors

    # fill A,M,O,D
    self._A = {}
    self._M = {}
    self._O = {}
    self._D = {}
    O = []

    # loop over unique grouping var
    self._re_group = re_group
    if isinstance(ind_data, dict):
        # groups are the keys (list() so a dict view becomes a real array)
        self._groups = np.array(list(ind_data.keys()))
    else:
        # groups need to be extracted from the recarray
        self._groups = np.unique(ind_data[re_group])
    for g in self._groups:
        # get that subj inds
        if isinstance(ind_data, dict):
            # the index is just the group into that dict
            ind_ind = g
        else:
            # select the rows based on the group
            ind_ind = ind_data[re_group] == g

        # process the row mask
        if row_mask is None:
            # no mask, so all good (builtin bool: np.bool no longer exists)
            row_ind = np.ones(len(ind_data[ind_ind]), dtype=bool)
        elif isinstance(row_mask, dict):
            # pull the row_mask from the dict
            row_ind = row_mask[g]
        else:
            # index into it with ind_ind
            row_ind = row_mask[ind_ind]

        # extract that group's A,M,O
        # first save the observations (rows of A)
        self._O[g] = ind_data[ind_ind][row_ind]
        if use_ranks:
            # loop over non-factors and rank them
            for n in self._O[g].dtype.names:
                if (n in factor_names) or isinstance(self._O[g][n][0], str):
                    continue
                self._O[g][n] = rankdata(self._O[g][n])
        O.append(self._O[g])

        # eventually allow for dict of data files for dep_data
        if isinstance(dep_data, dict):
            # the index is just the group into that dict
            dep_ind = g
        else:
            # select the rows based on the group
            dep_ind = ind_ind

        # save feature shape if necessary
        if self._feat_shape is None:
            self._feat_shape = dep_data[dep_ind].shape[1:]

            # handle the mask
            if self._dep_mask is None:
                self._dep_mask = np.ones(self._feat_shape, dtype=bool)

            # create the connectivity (will mask later)
            if self._do_tfce and self._connectivity is None and \
               (len(self._dep_mask.flatten()) > self._dep_mask.sum()):
                self._connectivity = cluster.sparse_dim_connectivity(
                    [cluster.simple_neighbors_1d(n)
                     for n in self._feat_shape])

        # Save D index into data (apply row and feature masks);
        # the boolean feature mask also flattens the feature axes
        self._D[g] = dep_data[dep_ind][row_ind][:, self._dep_mask].copy()

        if use_ranks:
            if verbose > 0:
                sys.stdout.write('Ranking %s...' % (str(g)))
                sys.stdout.flush()

            for i in range(self._D[g].shape[1]):
                # rank it
                self._D[g][:, i] = rankdata(self._D[g][:, i])

                # normalize it
                self._D[g][:, i] = ((self._D[g][:, i] - 1) /
                                    (len(self._D[g][:, i]) - 1))

        # save M from D so we can have a normalized version
        self._M[g] = self._D[g].copy()

        # remove any NaN's in dep_data (after the copy, so M keeps them)
        self._D[g][np.isnan(self._D[g])] = 0.0

        # normalize M
        if use_norm:
            self._M[g] -= self._M[g].mean(0)
            self._M[g] /= np.sqrt((self._M[g]**2).sum(0))

        # determine A from the model.matrix
        rdf = DataFrame({
            k: (FactorVector(self._O[g][k])
                if k in factor_names else self._O[g][k])
            for k in self._O[g].dtype.names
        })

        # model spec as data frame
        ms = r['data.frame'](r_model_matrix(Formula(fe_formula), data=rdf))

        cols = list(r['names'](ms))
        if svd_terms is None:
            self._svd_terms = [c for c in cols if 'Intercept' not in c]
        else:
            self._svd_terms = svd_terms

        self._A[g] = np.concatenate(
            [np.array(ms.rx(c)) for c in self._svd_terms]).T

        if use_ranks:
            for i in range(self._A[g].shape[1]):
                # rank it
                self._A[g][:, i] = rankdata(self._A[g][:, i])

                # normalize it
                self._A[g][:, i] = ((self._A[g][:, i] - 1) /
                                    (len(self._A[g][:, i]) - 1))

        # normalize A (always, independent of use_norm)
        self._A[g] -= self._A[g].mean(0)
        self._A[g] /= np.sqrt((self._A[g]**2).sum(0))

        # memmap if desired
        if self._memmap:
            self._M[g] = _memmap_array(self._M[g], memmap_dir,
                                       unique_id=str(g))
            self._D[g] = _memmap_array(self._D[g], memmap_dir,
                                       unique_id=str(g))

    # save the new O (list in group order replaces the per-group dict)
    self._O = O
    if lmer_opts is None:
        lmer_opts = {}
    self._lmer_opts = lmer_opts
    self._factors = factors

    # mask the connectivity
    if self._do_tfce and (len(self._dep_mask.flatten()) >
                          self._dep_mask.sum()):
        self._connectivity = self._connectivity.tolil()[
            self._dep_mask.flatten()][:, self._dep_mask.flatten()].tocoo()

    # prepare for the perms and boots and jackknife
    self._perms = []
    self._tp = []
    self._tb = []
    self._tj = []
    self._pfmask = []

    if verbose > 0:
        sys.stdout.write('Done (%.2g sec)\n' % (time.time() - start_time))
        sys.stdout.write('Processing actual data...')
        sys.stdout.flush()
        start_time = time.time()

    # register the instance so _eval_model can look it up by id
    global _global_meld
    _global_meld[id(self)] = self

    # run for actual data (returns both perm and boot vals)
    self._R = None
    self._ss = None
    self._mer = None
    tp, tb, R, feat_mask, ss, mer = _eval_model(id(self), None)
    self._R = R
    self._tp.append(tp)
    self._tb.append(tb)
    self._feat_mask = feat_mask
    self._fmask = ~feat_mask[0]
    self._pfmask.append(~feat_mask[0])
    self._ss = ss
    self._mer = mer

    if verbose > 0:
        sys.stdout.write('Done (%.2g sec)\n' % (time.time() - start_time))
        sys.stdout.flush()
def _to_pandas_factor(obj):
    """Build a ``pandas.Categorical`` from an R factor.

    Every positive integer code is decremented by one; any other value
    becomes -1 (presumably R's NA, mapped to the "missing" code of
    ``Categorical.from_codes`` -- confirm). Categories are read from the
    ``levels`` slot, and the result is ordered when ``'ordered'`` appears
    in ``obj.rclass``.
    """
    codes = [x - 1 if x > 0 else -1 for x in numpy.array(obj)]
    res = pandas.Categorical.from_codes(codes,
                                        categories=list(obj.do_slot('levels')),
                                        ordered='ordered' in obj.rclass)
    return res


# Register conversions keyed on R class name: integer vectors of class
# 'factor' go through _to_pandas_factor, list vectors of class 'data.frame'
# are wrapped in DataFrame and passed to rpy2py; numpy2ri.rpy2py is the
# first argument of each NameClassMap.
converter._rpy2py_nc_map.update({
    rinterface.IntSexpVector:
    conversion.NameClassMap(numpy2ri.rpy2py,
                            {'factor': _to_pandas_factor}),
    rinterface.ListSexpVector:
    conversion.NameClassMap(numpy2ri.rpy2py,
                            {'data.frame': lambda obj: rpy2py(DataFrame(obj))})
})


def activate():
    warnings.warn(
        'The global conversion available with activate() '
        'is deprecated and will be removed in the next '
        'major release. Use a local converter.',
        category=DeprecationWarning)
    global original_converter
    # If module is already activated, there is nothing to do.
    if original_converter is not None:
        return

    original_converter = conversion.Converter(
def test_image_png():
    """``ggplot.image_png`` returns something truthy for a minimal plot."""
    frame = DataFrame({'x': 1, 'Y': 2})
    plot = rpy2.robjects.lib.ggplot2.ggplot(frame)
    assert ggplot.image_png(plot)
def _convert_to_dataframe(x):
    """
    Wrap a Python list of integers in an R data frame with one column, 'y'.
    """
    return DataFrame({'y': IntVector(x)})