def create_initial_sample(n_obs, dim, type='lhs',
                          lower_bound=None, upper_bound=None):
    """Convenience helper that creates an initial sample, either via random
    (uniform) sampling or via Latin hypercube sampling.

    Args:
        n_obs: number of observations
        dim: number of dimensions
        type: type of sampling strategy (Default value = 'lhs')
        lower_bound: lower bounds of the initial sample as a list of size
            dim (Default value = [0] * dim)
        upper_bound: upper bounds of the initial sample as a list of size
            dim (Default value = [1] * dim)

    Returns:
        numpy array of shape (n_obs, dim)
    """
    if lower_bound is None:
        lower_bound = [0] * dim
    if upper_bound is None:
        upper_bound = [1] * dim
    pcontrol = {
        'init_sample.type': type,
        'init_sample.lower': IntVector(lower_bound),
        'init_sample.upper': IntVector(upper_bound)
    }
    return np.array(
        flacco.createInitialSample(n_obs, dim, ListVector(pcontrol)))
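# A minimal usage sketch for create_initial_sample (a hypothetical call,
# assuming rpy2 and the R package 'flacco' are installed and imported as in
# the function above): draw a 50-point Latin hypercube sample in [0, 1]^2.
#
#   sample = create_initial_sample(50, 2, type='lhs')
#   assert sample.shape == (50, 2)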
def sampleSizeRest():
    # Get the parsed contents of the form data.
    data = request.json
    k = data["k"].split(',')
    prev = data["prev"]
    N = data["N"]
    unique_id = data["unique_id"]
    fixed_flag = data["fixed_flag"]
    sens = data["sens"].split(',')
    spec = data["spec"].split(',')

    start = time.time()
    print("Starting Benchmark")
    if fixed_flag == "Specificity":
        jsonrtn = wrapper.saveAllSensGraphs(IntVector(k), FloatVector(sens),
                                            FloatVector(spec), float(prev),
                                            IntVector(N), unique_id)
    else:
        jsonrtn = wrapper.saveAllSpecGraphs(IntVector(k), FloatVector(sens),
                                            FloatVector(spec), float(prev),
                                            IntVector(N), unique_id)
    print("Seconds:", time.time() - start)

    jsonstring = ''.join(list(jsonrtn))
    print(jsonstring)
    return jsonstring
def py2rpy_pandasseries(obj):
    if obj.dtype.name == 'O':
        warnings.warn('Element "%s" is of dtype "O" and converted '
                      'to R vector of strings.' % obj.name)
        res = StrVector(obj)
    elif obj.dtype.name == 'category':
        res = py2rpy_categoryseries(obj)
        res = FactorVector(res)
    elif is_datetime64_any_dtype(obj.dtype):
        # Time series.
        tzname = obj.dt.tz.zone if obj.dt.tz else ''
        d = [IntVector([x.year for x in obj]),
             IntVector([x.month for x in obj]),
             IntVector([x.day for x in obj]),
             IntVector([x.hour for x in obj]),
             IntVector([x.minute for x in obj]),
             FloatSexpVector([x.second + x.microsecond * 1e-6 for x in obj])]
        res = ISOdatetime(*d, tz=StrSexpVector([tzname]))
        # TODO: can the POSIXct be created from the POSIXct constructor?
        # (is '<M8[ns]' mapping to Python datetime.datetime?)
        res = POSIXct(res)
    elif obj.dtype == dt_O_type:
        homogeneous_type = None
        for x in obj.values:
            if x is None:
                continue
            if homogeneous_type is None:
                homogeneous_type = type(x)
                continue
            if type(x) is not homogeneous_type:
                raise ValueError('Series can only be of one type, or None.')
        # TODO: could this be merged with the obj.dtype.name == 'O' case above?
        res = {int: IntVector,
               bool: BoolVector,
               None: BoolVector,
               str: StrVector,
               bytes: numpy2ri.converter.py2rpy.registry[numpy.ndarray]
               }[homogeneous_type](obj)
    else:
        # Converted as a numpy array.
        func = numpy2ri.converter.py2rpy.registry[numpy.ndarray]
        # Current conversion as performed by numpy.
        res = func(obj)
        if len(obj.shape) == 1:
            if obj.dtype != dt_O_type:
                # Force into an R vector.
                res = as_vector(res)
    # "index" is equivalent to "names" in R.
    if obj.ndim == 1:
        res.do_slot_assign('names',
                           StrVector(tuple(str(x) for x in obj.index)))
    else:
        res.do_slot_assign('dimnames',
                           SexpVector(conversion.py2rpy(obj.index)))
    return res
def _align_var(breaks_r, pop_col, n, verbose=False):
    prev_b = -1
    align = dict()
    align_t = [1]
    for e, b in enumerate(breaks_r):
        if prev_b + 1 == b and b < n + 1:
            try:
                assert min(align_t) != max(align_t)
                align[pop_col] = IntVector((min(align_t), max(align_t)))
            except AssertionError:
                if verbose:
                    print("can't align {} at {} for {}".format(align_t, e, b))
            align_t = [e + 2]
        else:
            align_t.append(e + 2)
        prev_b = b
    if len(align) == 0:
        align[pop_col] = IntVector((1, len(breaks_r)))
    align_r = DataFrame(align)
    return align_r
def fit(self, x, t, y, refit=False):
    if self.method_name == "lasso":
        print("fit lasso")
        self.model = self.rleaner.rlasso(x, IntVector(t), FloatVector(y))
    else:
        # Takes much longer to fit.
        print("fit boost")
        self.model = self.rleaner.rboost(x, IntVector(t), FloatVector(y))
def testFunction_select(self):
    dataf_a = self.DataFrame({'x': IntVector((1, 2)),
                              'y': IntVector((3, 4))})
    dataf_as = dplyr.select(dataf_a, 'y')
    self.assertEqual(1, dataf_as.collect().ncol)
    dataf_as = dplyr.select(dataf_a, '-x')
    self.assertEqual(1, dataf_as.collect().ncol)
def testFunction_group_by_summarize_arrange(self):
    dataf_a = self.DataFrame({'x': IntVector((1, 2, 1)),
                              'y': IntVector((3, 4, 5))})
    dataf_ag = dplyr.group_by(dataf_a, 'x')
    dataf_as = dplyr.summarize(dataf_ag, count='n()')
    dataf_aa = dplyr.arrange(dataf_as, 'count')
    self.assertEqual(2, dataf_aa.collect().nrow)
    self.assertSequenceEqual([1, 2], dataf_aa.collect().rx2('count'))
def export_smpl_split_to_r(smpls):
    n_smpls = len(smpls)
    all_train = ListVector.from_length(n_smpls)
    all_test = ListVector.from_length(n_smpls)
    for idx, (train, test) in enumerate(smpls):
        # Shift from Python's 0-based to R's 1-based indexing.
        all_train[idx] = IntVector(train + 1)
        all_test[idx] = IntVector(test + 1)
    return all_train, all_test
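# Usage sketch for export_smpl_split_to_r, assuming rpy2 is installed. The
# example folds below are illustrative; any iterable of (train, test) index
# arrays works.
#
#   import numpy as np
#   smpls = [(np.array([0, 1]), np.array([2, 3])),
#            (np.array([2, 3]), np.array([0, 1]))]
#   train_r, test_r = export_smpl_split_to_r(smpls)
#   # train_r[0] is the R vector c(1L, 2L)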
def testMethod_select(self):
    dataf_a = self.DataFrame({'x': IntVector((1, 2)),
                              'y': IntVector((3, 4))})
    dataf_as = dataf_a.select('y')
    self.assertEqual(1, dataf_as.collect().ncol)
    dataf_as = dataf_a.select('-x')
    self.assertEqual(1, dataf_as.collect().ncol)
def spMatrixToR(x):
    matrix_pkg = rpackages.importr('Matrix')
    coo_matrix = x.tocoo()
    numpy2ri.activate()
    # index1=False: scipy's COO indices are 0-based.
    result = matrix_pkg.sparseMatrix(i=IntVector(coo_matrix.row),
                                     j=IntVector(coo_matrix.col),
                                     x=FloatVector(coo_matrix.data),
                                     dims=IntVector(coo_matrix.shape),
                                     index1=False)
    numpy2ri.deactivate()
    return result
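# Usage sketch for spMatrixToR, assuming scipy and the R 'Matrix' package
# are available. Any scipy sparse format works, since the function converts
# to COO internally.
#
#   from scipy import sparse
#   m = sparse.random(5, 4, density=0.3, format='csr')
#   r_m = spMatrixToR(m)   # an R sparse matrix (dgCMatrix)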
def testMethod_group_by_summarize_arrange(self):
    dataf_a = self.DataFrame({'x': IntVector((1, 2, 1)),
                              'y': IntVector((3, 4, 5))})
    dataf_ag = dataf_a.group_by('x')
    dataf_as = dataf_ag.summarize(count='n()')
    dataf_aa = dataf_as.arrange('count')
    self.assertEqual(2, dataf_aa.collect().nrow)
    self.assertSequenceEqual([1, 2], dataf_aa.collect().rx2('count'))
def _extract_mapping(self, cimpl_obj, cis_sites):
    # Convert CIS sites to frame format.
    cis_frame = CisSite.to_frame(cis_sites)

    # Convert to R representation for cimpl.
    chr_with_prefix = add_prefix(cis_frame['chromosome'], prefix='chr')

    r_base = importr('base')
    cis_frame_r = RDataFrame({
        'id': r_base.I(StrVector(cis_frame['id'])),
        'chromosome': r_base.I(StrVector(chr_with_prefix)),
        'scale': StrVector(cis_frame['scale']),
        'start': IntVector(cis_frame['start']),
        'end': IntVector(cis_frame['end'])
    })
    cis_frame_r.rownames = StrVector(cis_frame['id'])

    # Retrieve cis matrix from cimpl.
    cis_matrix_r = self._cimpl.getCISMatrix(cimpl_obj, cis_frame_r)
    cis_matrix = dataframe_to_pandas(cis_matrix_r)

    # Extract scale information from cis matrix.
    scale_cols = [c for c in cis_matrix.columns if c.startswith('X')]
    cis_matrix_scales = cis_matrix[['id'] + scale_cols]

    # Melt matrix into long format.
    mapping = pd.melt(cis_matrix_scales, id_vars=['id'])
    mapping = mapping[['id', 'value']]
    mapping = mapping.rename(columns={'id': 'insertion_id',
                                      'value': 'cis_id'})

    # Split cis_id column into individual entries (for entries with
    # multiple ids). Then drop any empty rows, as these entries are
    # empty cells in the matrix.
    mapping = mapping.loc[mapping['cis_id'] != '']
    mapping = expand_column(mapping, col='cis_id', delimiter='|')

    mapping_dict = {ins_id: set(grp['cis_id'])
                    for ins_id, grp in mapping.groupby('insertion_id')}

    return mapping_dict
def cpt_poisson(x, penalty="MBIC", minseglen=2):
    """Changepoint detection with the Poisson distribution as test statistic.

    The baseline, equal to the smallest positive value, is removed;
    non-positive values are set to a very large RTT, 1e3.

    Args:
        x (list of numeric type): timeseries to be handled
        penalty (string): possible choices "None", "SIC", "BIC", "MBIC",
            "AIC", "Hannan-Quinn"

    Returns:
        list of int: beginning of each new segment in Python indexing, i.e.
        starting from 0. The actual return from the R changepoint detection
        is the last index of each segment; since R indexing starts from 1,
        the returned values naturally become the beginnings of segments.
    """
    x = np.rint(x)
    try:
        base = np.min([i for i in x if i > 0])
    except ValueError:
        # If there is no positive number in x, set base to 0.
        base = 0
    x = [i - base if i > 0 else 1e3 for i in x]
    return [int(i) for i in changepoint.cpts(
        changepoint.cpt_meanvar(IntVector(x),
                                test_stat='Poisson',
                                method='PELT',
                                penalty=penalty,
                                minseglen=minseglen))]
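# Usage sketch for cpt_poisson, assuming the R 'changepoint' package is
# importable as `changepoint` (as in the function above). The toy series
# below has a clear level shift halfway through, so a changepoint near
# index 50 is expected.
#
#   rtts = [20] * 50 + [80] * 50
#   cpt_poisson(rtts, penalty="MBIC")   # -> approximately [50]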
def RiverSmooth(dem, direction, river_summary, river_segments, mask=None,
                bank_epsilon=0.01, river_epsilon=0.0,
                d4: tuple = (1, 2, 3, 4), printflag=False):
    if mask is None:
        mask = RNone
    d4 = IntVector(d4)
    results = pf.RiverSmooth(dem=dem,
                             direction=direction,
                             mask=mask,
                             river_summary=river_summary,
                             river_segments=river_segments,
                             bank_epsilon=bank_epsilon,
                             river_epsilon=river_epsilon,
                             d4=d4,
                             printflag=printflag)
    return _pfprocess(results, ["dem.adj", "processed", "summary"])
def _translate_control(control):
    """Transforms a Python dict into a valid R object.

    Args:
        control: Python dict

    Returns:
        R object of type ListVector
    """
    ctrl = {}
    for key, lst in control.items():
        if isinstance(lst, list):
            # Check bool before int: bool is a subclass of int, so the int
            # check would otherwise also match lists of booleans.
            if all(isinstance(n, bool) for n in lst):
                entry = BoolVector(lst)
            elif all(isinstance(n, int) for n in lst):
                entry = IntVector(lst)
            elif all(isinstance(n, float) for n in lst):
                entry = FloatVector(lst)
            elif all(isinstance(n, str) for n in lst):
                entry = StrVector(lst)
            else:
                entry = None
            if entry is not None:
                ctrl[key] = entry
            else:
                ctrl[key] = lst
        else:
            ctrl[key] = lst
    return ListVector(ctrl)
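# Usage sketch for _translate_control; the keys below are illustrative only.
# Homogeneous Python lists map to the matching R vector type; anything else
# is passed through for ListVector to handle.
#
#   ctrl = _translate_control({
#       'iters': [10, 20, 30],       # -> IntVector
#       'use_cache': [True, False],  # -> BoolVector
#       'labels': ['a', 'b'],        # -> StrVector
#       'tol': [1e-6, 1e-8],         # -> FloatVector
#   })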
def testMethod_mutate(self):
    dataf_a = self.DataFrame({'x': IntVector((1, 2))})
    dataf_am = dataf_a.mutate(y='x + 3')
    self.assertEqual(2, dataf_am.ncol)
    self.assertSequenceEqual([x + 3 for x in dataf_a.collect().rx2('x')],
                             dataf_am.collect().rx2('y'))
def _create_R_dataframe(self, job_ads, include_columns):
    """Converts job ads to an R dataframe.

    Arguments
    ----------
    job_ads : list[:class:`JobAd`]
        List of :class:`JobAd` instances.
    include_columns : list[str]
        Defines which columns are included in the dataframe.

    Returns
    ----------
    dataf : :class:`robjects.DataFrame`
        :class:`robjects.DataFrame` representing job ads.
    """
    # Modify structure to type {column: [rows]}.
    if len(job_ads) == 0:
        raise Exception("No job ads to convert to R dataframe.")

    job_ads_dataf = {}
    for column in include_columns:
        job_ads_dataf[column] = [self._remove_diacritics(ad[column])
                                 for ad in job_ads]
        if column == "relevant":
            job_ads_dataf[column] = IntVector(job_ads_dataf[column])
        else:
            job_ads_dataf[column] = self._base.I(
                StrVector(job_ads_dataf[column]))

    return robjects.DataFrame(job_ads_dataf)
def calc_gini(probes):
    # Count node occurrences.
    nodes = dict()
    for probe in probes:
        if probe[1] not in nodes:
            nodes[probe[1]] = 1
        else:
            nodes[probe[1]] += 1
        if probe[3] not in nodes:
            nodes[probe[3]] = 1
        else:
            nodes[probe[3]] += 1

    # Calculate the Gini coefficient.
    r_stats = importr('stats')
    total = 0
    node_selection = [nodes[node] for node in nodes]
    if len(node_selection) == 0:
        return 1.0
    fdata = IntVector(node_selection)
    Fn = r_stats.ecdf(fdata)
    for nr in set(node_selection):
        cdf_x = Fn(nr)[0]
        total += cdf_x * (1 - cdf_x)
    return total / mean(node_selection)
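# Usage sketch for calc_gini. The probe format is assumed from the code
# above: the elements at indices 1 and 3 of each probe are node identifiers.
#
#   probes = [(0, 'a', 0, 'b'), (0, 'a', 0, 'c'), (0, 'b', 0, 'c')]
#   calc_gini(probes)   # every node appears twice -> perfectly even, 0.0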
def as_data_frame(self, channels, unit):
    """Preferred use of the ATF class. Outputs the dataset as a
    dataframe for further computation.
    """
    unit = NULL if unit is None else unit
    channels = IntVector(channels)
    return r["adaptATF"](self._atf, channels=channels, unit=unit)
def pd_ts2r_ts(pd_ts):
    '''Pandas timeseries (pd_ts) to R timeseries (r_ts) conversion.'''
    from rpy2.robjects.vectors import IntVector, FloatVector
    rstats = rpackages.importr('stats')
    r_start = IntVector((pd_ts.index[0].year,
                         pd_ts.index[0].month,
                         pd_ts.index[0].day))
    r_end = IntVector((pd_ts.index[-1].year,
                       pd_ts.index[-1].month,
                       pd_ts.index[-1].day))
    # A dictionary for converting pandas.Series frequencies into
    # R ts frequencies.
    freq_pandas2r_ts = {
        'D': 365,  # is this correct, how about leap-years?
        'M': 12,
        'Y': 1,
    }
    r_freq = freq_pandas2r_ts[pd_ts.index.freqstr]
    result = rstats.ts(FloatVector(pd_ts.values),
                       start=r_start, end=r_end, frequency=r_freq)
    return result
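# Usage sketch for pd_ts2r_ts: a monthly pandas Series becomes an R ts
# object with frequency 12. Assumes rpy2 and pandas are installed; the
# index's freqstr must be one of the keys in the mapping above.
#
#   import pandas as pd
#   idx = pd.date_range('2020-01-31', periods=24, freq='M')
#   s = pd.Series(range(24), index=idx)
#   r_ts = pd_ts2r_ts(s)   # stats::ts(..., frequency=12)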
def py2ri_pandasseries(obj):
    if obj.dtype.name == 'category':
        res = py2ri_categoryseries(obj)
        res = FactorVector(res)
    elif obj.dtype == dt_datetime64ns_type:
        # Time series.
        d = [IntVector([x.year for x in obj]),
             IntVector([x.month for x in obj]),
             IntVector([x.day for x in obj]),
             IntVector([x.hour for x in obj]),
             IntVector([x.minute for x in obj]),
             IntVector([x.second for x in obj])]
        res = ISOdatetime(*d)
        # FIXME: can the POSIXct be created from the POSIXct constructor?
        # (is '<M8[ns]' mapping to Python datetime.datetime?)
        res = POSIXct(res)
    else:
        # Converted as a numpy array.
        func = numpy2ri.converter.py2ri.registry[numpy.ndarray]
        # Current conversion as performed by numpy.
        res = func(obj)
        if len(obj.shape) == 1:
            if obj.dtype != dt_O_type:
                # Force into an R vector.
                res = as_vector(res)
    # "index" is equivalent to "names" in R.
    if obj.ndim == 1:
        res.do_slot_assign('names',
                           StrVector(tuple(str(x) for x in obj.index)))
    else:
        res.do_slot_assign('dimnames',
                           SexpVector(conversion.py2ri(obj.index)))
    return res
def py2ri_pandasseries(obj):
    if obj.dtype == '<M8[ns]':
        # Time series.
        d = [IntVector([x.year for x in obj]),
             IntVector([x.month for x in obj]),
             IntVector([x.day for x in obj]),
             IntVector([x.hour for x in obj]),
             IntVector([x.minute for x in obj]),
             IntVector([x.second for x in obj])]
        res = ISOdatetime(*d)
        # FIXME: can the POSIXct be created from the POSIXct constructor?
        # (is '<M8[ns]' mapping to Python datetime.datetime?)
        res = POSIXct(res)
    else:
        # Converted as a numpy array.
        res = numpy2ri.numpy2ri(obj.values)
    # "index" is equivalent to "names" in R.
    if obj.ndim == 1:
        res.do_slot_assign('names',
                           StrVector(tuple(str(x) for x in obj.index)))
    else:
        res.do_slot_assign('dimnames',
                           SexpVector(conversion.py2ri(obj.index)))
    return res
def create_feature_object(x, y, minimize=True, lower=0, upper=1, blocks=None):
    """Creates a FeatureObject which will be used as input for all the
    feature computations.

    Args:
        x: numpy 2D array containing the initial sample
        y: list containing the objective values of the initial sample
        minimize: logical variable defining whether the objective is to be
            minimized (Default value = True)
        lower: python list or integer defining the lower limits per
            dimension (Default value = 0)
        upper: python list or integer defining the upper limits per
            dimension (Default value = 1)
        blocks: number of blocks per dimension (Default value = None)

    Returns:
        rpy2.robject
    """
    numpy2ri.activate()
    x = R.r.matrix(x, nrow=len(x))
    numpy2ri.deactivate()
    y = FloatVector(y)
    if blocks is None:
        result = flacco.createFeatureObject(X=x, y=y, minimize=minimize,
                                            lower=lower, upper=upper,
                                            force=False)
    else:
        blocks = (IntVector(blocks) if isinstance(blocks, list)
                  else IntVector([blocks]))
        result = flacco.createFeatureObject(X=x, y=y, minimize=minimize,
                                            lower=lower, upper=upper,
                                            blocks=blocks, force=False)
    return result
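# Usage sketch for create_feature_object, pairing it with
# create_initial_sample above (assumes the R 'flacco' package is installed).
# The objective below is an illustrative sphere function.
#
#   X = create_initial_sample(100, 2, type='lhs')
#   y = [xi[0] ** 2 + xi[1] ** 2 for xi in X]
#   fobj = create_feature_object(X, y, minimize=True, blocks=3)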
def as_data_frame(self, sweep, channels, unit):
    """Preferred use of the ABF class. Outputs the dataset as a
    dataframe for further computation.
    """
    # The only supported `type` argument to `as.data.frame` is "one".
    # This is intended; we never need other types in our GUI.
    sweep = NULL if sweep is None else sweep
    unit = NULL if unit is None else unit
    channels = IntVector(channels)
    return r["as.data.frame"](self._abf, sweep=sweep, type="one",
                              channels=channels, unit=unit)
def drainageArea(direction, mask=None, d4: tuple = (1, 2, 3, 4),
                 printflag=False):
    if mask is None:
        mask = RNone
    d4 = IntVector(d4)
    results = pf.drainageArea(direction=direction, mask=mask, d4=d4,
                              printflag=printflag)
    return _pfprocess(results, ["drainarea"])
def fit(self, x: np.ndarray, t: np.ndarray, y: np.ndarray) -> None:
    """Fits the forest using factual data."""
    from rpy2.robjects.vectors import FloatVector, IntVector

    integer_random_state = int_from_random_state(self.random_state)

    self.forest = self.grf.causal_forest(
        x,
        FloatVector(y),
        IntVector(t),
        seed=integer_random_state,
        num_trees=self.num_trees,
        **self.kwargs
    )
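# Usage sketch for fit, assuming the R 'grf' package is importable via rpy2
# and `learner` is an instance of the surrounding class. The toy data below
# is illustrative: 100 units, 3 covariates, binary treatment.
#
#   import numpy as np
#   x = np.random.normal(size=(100, 3))
#   t = np.random.binomial(1, 0.5, size=100)
#   y = x[:, 0] + 0.5 * t + np.random.normal(size=100)
#   learner.fit(x, t, y)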
def _get_breaks(census_col, verbose=False):
    """Compute census breaks."""
    old_var_name = None
    breaks = list()
    for e, col in enumerate(census_col):
        var_name = col.split('_')[0]
        if var_name != old_var_name:
            if e - 1 > 0:
                if verbose:
                    print("adding {} as break = {}".format(var_name, e - 1))
                breaks.append(e - 1)
            old_var_name = var_name
    breaks = breaks[:-1]
    breaks_r = IntVector(breaks)
    return breaks_r
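# Illustration of _get_breaks on hypothetical census column names: a break
# is recorded wherever the prefix before '_' changes, and the final break is
# dropped.
#
#   cols = ['age_0_4', 'age_5_9', 'inc_low', 'inc_high', 'edu_hs']
#   _get_breaks(cols)   # prefix changes give breaks [1, 3]; the last is
#                       # dropped -> IntVector([1])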
def fit_forecast_model(y, freq, model, **kwargs):
    """Wrapper of the following flow:
    - Load the _forecast_ package.
    - Transform the data into a ts object.
    - Fit the model.

    Parameters
    ----------
    y: array-like
        Values of the time series.
    freq: int or iterable
        Frequency of the time series. Can be multiple seasonalities.
        (The last seasonality is considered the frequency.)
    model: str
        Name of a model included in the _forecast_ package,
        e.g. 'auto.arima'.
    kwargs:
        Arguments of the model function.

    Returns
    -------
    rpy2 object
        Fitted model.
    """
    pandas2ri.activate()
    freq = deepcopy(freq)
    if not isinstance(freq, int):
        freq = IntVector(freq)
    rstring = """
        function(y, freq, ...){
            suppressMessages(library(forecast))
            y_ts <- msts(y, seasonal.periods=freq)
            fitted_model <- %s(y_ts, ...)
            fitted_model
        }
    """ % model
    rfunc = robjects.r(rstring)
    fitted = rfunc(FloatVector(y), freq, **kwargs)
    return fitted
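# Usage sketch for fit_forecast_model, assuming the R 'forecast' package is
# installed: fit auto.arima to two years of synthetic monthly seasonality.
#
#   import numpy as np
#   y = 10 + np.sin(np.arange(24) * 2 * np.pi / 12)
#   fitted = fit_forecast_model(y, freq=12, model='auto.arima')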
def InitQueue(dem, initmask=None, domainmask=None, d4: tuple = (1, 2, 3, 4)):
    # https://github.com/lecondon/PriorityFlow/blob/master/Rpkg/R/Init_Queue.R#L18
    if initmask is None:
        initmask = RNone
    if domainmask is None:
        domainmask = RNone
    d4 = IntVector(d4)
    results = pf.InitQueue(dem=dem, initmask=initmask,
                           domainmask=domainmask, d4=d4)
    return _pfprocess(results,
                      ["mask", "queue", "marked", "basins", "direction"])
def run_boruta(data, target, names, name, outdir):
    # Run the Boruta algorithm on data and target.
    grdevices = importr('grDevices')
    boruta = importr('Boruta')
    r = robjects.r
    base = importr('base')

    data2 = {}
    for i in range(len(names)):
        data2[names[i]] = FloatVector(data[:, i])
    x = robjects.DataFrame(data2)
    y = IntVector(target)

    print("running Boruta")
    result = boruta.Boruta(x, y)
    print(result)
    print(boruta.attStats(result))

    with open(outdir + "boruta.data", 'wb') as f:
        pickle.dump(result, f)