def make_dataframe(data, xid, yid, factorid):
    """Flatten a two-level mapping into a three-column R data frame.

    Args:
        data: nested mapping ``{n: {step: runtime}}``; every
            ``(n, step, runtime)`` triple becomes one row.
        xid: column name for the outer keys (R integer vector).
        yid: column name for the inner values (R integer vector).
        factorid: column name for the inner keys (R string vector).

    Returns:
        The ``ro.DataFrame`` assembled from the three columns.
    """
    rows = [
        (outer_key, inner_key, value)
        for outer_key, inner in data.items()
        for inner_key, value in inner.items()
    ]
    x_column = ro.IntVector([row[0] for row in rows])
    factor_column = ro.StrVector([row[1] for row in rows])
    y_column = ro.IntVector([row[2] for row in rows])
    return ro.DataFrame({xid: x_column, factorid: factor_column, yid: y_column})
def calculate(self, msg):
    """Write a Pareto chart of binned predicted times to ``<save_path>.png``.

    Args:
        msg: a ``(save_path, all_predicted_time)`` pair.
            ``all_predicted_time`` is an iterable of iterables that is
            flattened before being histogrammed into 5 bins; ``save_path``
            is used with ``%s`` formatting and must offer ``basename()``.
    """
    save_path, all_predicted_time = msg
    flat_times = list(itertools.chain.from_iterable(all_predicted_time))
    counts, edges, _ = plt.hist(flat_times, bins=5)
    qt = importr("qualityTools")
    grdevices = importr('grDevices')

    # One label per bin: its (lower, upper) edges, each divided by 100.
    bin_labels = [(low / 100, high / 100)
                  for low, high in zip(edges[:-1], edges[1:])]

    # Repeat every bin label once per observation that fell into the bin.
    observations = []
    for label, count in zip(bin_labels, counts):
        observations.extend([str(label)] * int(count))

    grdevices.png(file="%s.png" % save_path, width=600, height=400)
    qt.paretoChart(robjects.StrVector(observations),
                   main="predicted_time--" + save_path.basename())
    grdevices.dev_off()
def heatmap(Arguments):
    '''
    Load the data, build a clustered matrix in R and map it to colors.

    NOTE(review): the bare exit() call below ends the process before anything
    is written, so every statement after it (side-color matrices, the PDF
    device and the get_heatmap call) is unreachable as the code stands. It
    looks like a debugging leftover -- confirm before removing it.
    '''

    Data = load_data(Arguments)

    # Optional re-ordering, applied only when Arguments.OrderBy is truthy.
    if Arguments.OrderBy:
        Data = reorder_data(Data, Arguments)

    #This is specifically the part you want plotted on the heatmap, NOT the sidebars
    Features, Variates = get_heatmap_data(Data, Arguments)

    # Keep only the part of each feature name before its first ":"
    # (index() raises ValueError for a name that has no ":").
    Features = [Feature[:Feature.index(":")] for Feature in Features]

    # The R function is looked up by name in the embedded R session;
    # presumably it is defined by R code sourced elsewhere -- TODO confirm.
    ClusteredMatrix = ro.r["hierarchical_cluster_distance_matrix"](
        float_matrix(Data.Labels, Features, Variates, Arguments))

    # NOTE(review): Columns and Rows are assigned but not used below.
    Columns = list(ClusteredMatrix.colnames)
    Rows = list(ClusteredMatrix.rownames)
    Matrix = np.array(ClusteredMatrix)

    ZMatrix = z_matrix(Matrix)

    # 100-step blue -> white -> red palette generated by R's colorRampPalette.
    ColorPalette = list(ro.r["colorRampPalette"](ro.StrVector(
        ["blue", "white", "red"]))(100))

    ColorMatrix = [float2color(Vector, ColorPalette) for Vector in ZMatrix]

    # NOTE(review): terminates the interpreter here; nothing below ever runs.
    exit()

    Heatmap = float_matrix(Data.Labels, Features, Variates, Arguments)

    Legend = False #overwrite if ColSideColors is called
    if "colsidecolors" in [Option.lower() for Option in Arguments.Heatmap]:
        ColorFeatures, ColSideMatrix, Legend = get_colsidematrix(
            Data, Arguments)
        ColSideMatrix = transpose_matrix(
            string_matrix(Data.Labels, ColorFeatures, ColSideMatrix,
                          Arguments))
    else:
        ColSideMatrix = False

    if "rowsidecolors" in [Option.lower() for Option in Arguments.Heatmap]:
        RowSideMatrix = get_rowsidematrix(Features, Arguments)
    else:
        RowSideMatrix = False

    # Open the PDF device named after Arguments.Filename.
    ro.r['pdf'](Arguments.Filename + '.pdf')
    if Legend:
        ro.r['par'](mar=ro.IntVector([1, 1, 1, 1]))
    # 'cex.axis' is not a valid Python identifier, hence the ** dict.
    ro.r['par'](**{'cex.axis': 0.8})
    get_heatmap(Heatmap, ColSideMatrix=ColSideMatrix,
                RowSideMatrix=RowSideMatrix, Legend=Legend)
    ro.r['dev.off']()

    return
def get_mail_corpus(nlon_cleaning=False):
    """Load the e-mail corpus and group cleaned message bodies by sender.

    Reads ``data/mailcorpus.json`` (a JSON list of records). Only records
    whose ``type_of_recipient`` is ``'From'`` are used. For each of them the
    literal ``\\n`` sequences of ``message_body`` are turned into real line
    breaks and the reply part is extracted with ``EmailReplyParser``.

    Args:
        nlon_cleaning: when true, the reply is additionally passed through
            BeautifulSoup's ``html.parser`` and every line for which the
            NLoN model predicts ``'Not'`` is dropped.

    Returns:
        A dict mapping each ``email_address`` to the list of its non-empty
        cleaned texts.
    """
    if nlon_cleaning:
        nlon, nlon_model = training_nlon()

    # Path to mail's corpus
    corpus_file = 'data/mailcorpus.json'
    with open(corpus_file) as data_file:
        corpus = json.load(data_file)

    total = len(corpus)
    print('Reading and cleaning emails corpus. Number of emails: ' + str(total))

    mails_by_address = {}
    n = 0
    # Text cleaning
    for d in corpus:
        if d['type_of_recipient'] != 'From':
            continue

        res = EmailReplyParser.read(d['message_body'].replace('\\n', '\n'))
        text = res.reply
        n += 1

        if nlon_cleaning:
            # The HTML-stripped text must feed the line filter; previously the
            # BeautifulSoup result was computed and then thrown away.
            try:
                text = BS4(text, 'html.parser').text
            except Exception as e:  # best effort: keep the unparsed text
                print('Error with BS4 on text:\n\n%s\n\n' % text, str(e))
                text = text.strip()

            kept_lines = [
                line for line in text.splitlines()
                if nlon.NLoNPredict(
                    nlon_model, robjects.StrVector([line]))[0] != 'Not'
            ]
            text = '\n'.join(kept_lines)

        if text != '':
            mails_by_address.setdefault(d['email_address'], []).append(text)

        # Progress line once per 50 processed 'From' records.
        if n % 50 == 0:
            print(str(n) + '/' + str(total))

    print('Mails retrieved: ' + str(n))
    print('Email addresses: ' + str(len(mails_by_address)))
    return mails_by_address
def make_rvector(col, ct=COLTYPE.FLOAT):
    """Make and return an R vector for data in `col` of COLTYPE ct.

    Args:
        col: sequence holding the values of one column.
        ct: the COLTYPE member describing how to convert `col`.

    Returns:
        robjects.Vector

    Raises:
        TypeError if the type is unknown
        TypeError if it is COLTYPE.DATE but the first element is neither a
            datetime.datetime nor a float
    """
    if ct == COLTYPE.INT:
        return robjects.IntVector(col)
    if ct == COLTYPE.FLOAT:
        return robjects.FloatVector(col)
    if ct == COLTYPE.STR:
        # Use I() from R.base library to avoid conversion
        # into a factor. Usually though a factor is what you want.
        return base.I(robjects.StrVector(col))
    if ct == COLTYPE.BOOL:
        return robjects.BoolVector(col)
    if ct == COLTYPE.FACTOR:
        # conversion will happen automatically
        return robjects.StrVector(col)
    if ct == COLTYPE.DATE:
        # Nothing to inspect in an empty column; avoid IndexError on col[0].
        if len(col) == 0:
            return robjects.FloatVector([])
        # The first element decides how the whole column is interpreted.
        field = col[0]
        if isinstance(field, datetime.datetime):
            # Materialise the list: map() is a lazy iterator on Python 3.
            return robjects.FloatVector([datetime_to_sec(v) for v in col])
        if isinstance(field, float):
            return robjects.FloatVector(col)
        # The old message interpolated names that do not exist in this
        # function, which turned the intended TypeError into a NameError.
        raise TypeError("Bad date type '%s'. "
                        "Expected datetime.datetime or float."
                        % (type(field),))
    raise TypeError("Unknown column type '%s'." % (ct,))
def get_dps(self, entries):
    """ Wrapper to retrieve dotproducts from DPS database (neuronlistfh)
    as neuronslist.

    Parameters
    ----------
    entries :   int | str | list thereof
                Neurons to extract from DPS database. Can be:

                1. int: e.g. ``entries=5`` for top 5 hits
                2. list of ints: e.g. ``entries=[2,5]`` for hits 2 and 5
                3. string: e.g. ``entries='THMARCM-198F_seg1'`` for this
                   neuron
                4. list of strings:
                   e.g. ``['THMARCM-198F_seg1', 'npfMARCM-F000003_seg002']``
                   for multiple neurons by their gene name

    Returns
    -------
    neuronlist of dotproduct neurons, or ``None`` (after logging an error)
    when ``entries`` cannot be interpreted.
    """
    # NumPy integers are not instances of int, so accept both explicitly.
    int_types = (int, np.integer)

    if isinstance(entries, int_types):
        # DataFrame.ix was removed in pandas 1.0. On an integer index .ix
        # was label-based, which is exactly what .loc does (inclusive stop).
        # NOTE(review): assumes self.results has an integer index - confirm.
        names = self.results.loc[:entries - 1].gene_name.tolist()
        return self.db.rx(robjects.StrVector(names))

    if isinstance(entries, str):
        return self.db.rx(entries)

    if isinstance(entries, (list, np.ndarray)) and len(entries) > 0:
        first = entries[0]
        if isinstance(first, int_types):
            names = self.results.loc[entries].gene_name.tolist()
            return self.db.rx(robjects.StrVector(names))
        if isinstance(first, str):
            return self.db.rx(robjects.StrVector(entries))

    # Reached for unsupported types, empty sequences and sequences whose
    # elements are neither integers nor strings.
    logger.error('Unable to intepret entries provided. See '
                 'help(NBLASTresults.plot3d) for details.')
    return None
def creatruleset(self, rules):
    """Assemble an R ``rules`` object called ``ruleset`` in R's global env.

    Args:
        rules: mapping from a rule string of the form ``"lhs=>rhs"`` to a
            mapping with the keys ``'sup'``, ``'conf'`` and ``'lift'``.
            Item names are the lower-case letter runs found on each side.

    Nothing is returned; the result lives in the R variable ``ruleset``.
    """
    ruleset_exists = False
    for rule_text, quality in rules.items():
        sides = str.split(rule_text, '=>')
        robjects.globalenv["l"] = robjects.StrVector(
            re.findall("[a-z]+", sides[0]))
        robjects.globalenv["r"] = robjects.StrVector(
            re.findall("[a-z]+", sides[1]))
        robjects.globalenv["q"] = robjects.DataFrame({
            'support': int(quality['sup']),
            'confidence': float(quality['conf']),
            'lift': float(quality['lift'])
        })

        # Build one-row item matrices for both sides of the rule; each one
        # spans the columns of l and r so lhs and rhs share the item set.
        robjects.r('''
            lm<-matrix(1,ncol=length(l))
            dimnames(lm)<-list(NULL,l)
            x<-matrix(0,ncol=length(r))
            dimnames(x)<-list(NULL,r)
            lhs<-as(cbind(lm,x),"itemMatrix")
            rm<-matrix(1,ncol=length(r))
            dimnames(rm)<-list(NULL,r)
            x<-matrix(0,ncol=length(l))
            dimnames(x)<-list(NULL,l)
            rhs<-as(cbind(x,rm),"itemMatrix")
            ''')

        if not ruleset_exists:
            # The first rule creates the set, later ones are appended.
            robjects.r('ruleset<-new("rules",lhs=lhs,rhs=rhs,quality=q)')
            ruleset_exists = True
        else:
            robjects.r('''
                rule<-new("rules",lhs=lhs,rhs=rhs,quality=q)
                ruleset<-c(ruleset,rule)
                ''')
def mplotHis(moptions):
    """Plot the grouped rank fractions into ``<outFolder>/hist2_<FileID>.pdf``.

    Args:
        moptions: options mapping. Reads ``"FileID"``, ``'outFolder'`` and
            ``'CaseSizes'``; note that ``moptions['CaseSizes']`` is sorted
            in place. The whole mapping is also handed to ``group_rank``.
    """
    (perclist, rankgrouplist, rankperclist,
     split_points, myRankStr) = group_rank(moptions)

    figname = moptions["FileID"]
    mresfolder = moptions['outFolder']

    importr('ggplot2')
    importr('gridExtra')

    spvector = robjects.IntVector(split_points)
    rankstrvector = robjects.StrVector(myRankStr)

    case_sizes = moptions['CaseSizes']
    case_sizes.sort()
    csvector = robjects.IntVector(case_sizes)

    mixed_perc = robjects.FactorVector(
        robjects.IntVector(perclist), levels=csvector, labels=csvector)
    percentile = robjects.FactorVector(
        robjects.StrVector(rankgrouplist),
        levels=rankstrvector, labels=rankstrvector)
    mdfperc = robjects.DataFrame({
        "MixedPerc": mixed_perc,
        "Percentile": percentile,
        "Fraction": robjects.FloatVector(rankperclist)
    })

    # Define Hist_sim_plot in the R session from the packaged script.
    robjects.r(resource_string(__name__, 'Rscript/Hist_sim_plot.R'))

    # Page width scales with the number of case sizes.
    pdf_width = "%.0f" % (len(moptions["CaseSizes"]) * 0.8)
    pdf_path = mresfolder + '/hist2_' + figname + '.pdf'
    robjects.r('pdf("' + pdf_path + '", width=' + pdf_width +
               ', height=4, onefile = TRUE)')

    robjects.globalenv['Hist_sim_plot'](mdfperc, spvector, rankstrvector)

    robjects.r('dev.off()')
def plot_module_eigengene(self, module):
    '''
    barchart illustrating module eigengene

    module -- module identifier; it is passed to
    self.eigengenes.get_module_eigengene and appended to the chart title.

    Bars are red for values > 0 and blue otherwise; the y axis is symmetric
    around zero.
    '''
    eigengene = self.eigengenes.get_module_eigengene(module)

    # Largest absolute value decides the symmetric axis range.
    limit = max(abs(base().max(eigengene)[0]),
                abs(base().min(eigengene)[0]))

    params = {}
    params['height'] = base().as_numeric(eigengene)
    # The limit comes from R's max/min of the eigengene and need not be a
    # whole number, so the range must be a FloatVector, not an IntVector.
    params['ylim'] = ro.FloatVector([-1 * limit, limit])
    params['col'] = ro.StrVector(
        ["red" if e[0] > 0 else "blue" for e in eigengene])
    params['border'] = ro.NA_Logical
    params['las'] = 2
    params['names.arg'] = ro.StrVector(self.eigengenes.samples())
    params['cex.names'] = 0.6
    params['main'] = "Eigengene: " + module

    manager = RManager(eigengene, params)
    manager.barchart()
def module_eigengenes(self, membership):
    '''
    wrapper for moduleEigengenes function

    Passes the transposed data (as an R data frame), the membership values
    (as an R string vector) and the soft power to moduleEigengenes. The
    soft power is self.params['power'] when that key exists, otherwise 6.
    '''
    soft_power = 6
    if 'power' in self.params:
        soft_power = self.params['power']

    expression = base().as_data_frame(self.transpose_data())
    module_colors = ro.StrVector(membership)

    return wgcna().moduleEigengenes(softPower=soft_power,
                                    expr=expression,
                                    colors=module_colors)
def _filter_and_values_to_RList(d):
    """Split a ``filter -> values`` dictionary into two R objects.

    Returns a ``(StrVector, ListVector)`` pair: the filter names, and the
    values tagged with those names in the same order.
    """
    filter_names = robjects.StrVector(list(d.keys()))
    # A TaggedList (instead of handing the dict to ListVector) pins the
    # positional order of the entries to the order of the names above.
    tagged_values = rpy2.rlike.container.TaggedList(
        d.values(), tags=list(d.keys()))
    return filter_names, robjects.ListVector(tagged_values)
def read_target_group_of_interest(targetspath):
    """
    Read a tab-separated file and publish its first column to R.

    The first field of every line (assumed to be a transcript/gene
    identifier) is stripped of surrounding whitespace and the resulting
    list is stored as the string vector ``targetset`` in the R global
    environment.
    """
    with open(targetspath) as handle:
        identifiers = [row.split("\t")[0].strip() for row in handle]
    robjects.globalenv["targetset"] = robjects.StrVector(identifiers)
def _run(self, cdata):
    """Call the R function ``run`` from ``self.r_file`` with ``cdata``.

    ``cdata`` is serialised to JSON and decoded on the R side with
    ``rjson::fromJSON``; the decoded value is the argument list given to
    ``do.call('run', ...)``.

    Returns:
        The ``str`` group matched by ``R_STRING_RE`` in the printed result.

    Raises:
        ValueError: if the printed result does not match ``R_STRING_RE``.
    """
    # Execute script using R
    payload = json.dumps(cdata, indent=4)
    r_arguments = importr("rjson").fromJSON(robjects.StrVector([payload]))
    source_r_file(self.r_file)
    raw_result = r['do.call']('run', r_arguments)
    # Resolve the backslash escapes contained in R's printed representation.
    result = str(raw_result).encode("utf-8").decode("unicode_escape")

    # Interpret results as string
    match = R_STRING_RE.match(result)
    if match:
        return match.groupdict()["str"]

    err_msg = "Expected a string as return value from R script. Got: {}"
    raise ValueError(err_msg.format(result))
def catch_dataset_ids(self, data_frame, id_label):
    """Separate the id column from the remaining columns of an R data frame.

    Args:
        data_frame: the R data frame to split.
        id_label: name of the column holding the ids.

    Returns:
        ``[data_frame_without_id_column, ids]`` where ``ids`` is a list of
        the id values with every double quote removed.
    """
    column_names = r['names'](data_frame)
    id_pos = get_item_pos(column_names, id_label)

    raw_ids = list(r['as.vector'](data_frame[id_pos]))
    ids_idx = [raw_id.replace('"', '') for raw_id in raw_ids]

    other_columns = [name for name in list(column_names) if name != id_label]
    data_frame_x = data_frame.rx(True, robjects.StrVector(other_columns))

    return [data_frame_x, ids_idx]
def genResampleData(infile, outfile):
    '''
    Resample the data n-times with replacement - generates
    n flat files which are then propagated at later stages.
    Files are generally small though.

    Time points and replicates are parsed from the "-"-separated keys of
    the track2groups mappings of TIME and REPLICATE; the reference GTF and
    the condition are parsed from the "-"-separated infile name. The
    command template is executed by P.run(), then outfile is touched.
    '''
    time_agg = TIME.__dict__['track2groups'].keys()
    # De-duplicate first and sort afterwards: sorting before the set()
    # round trip does not guarantee any order.
    time_points = sorted({int(str(x).split("-")[1]) for x in time_agg})

    rep_agg = REPLICATE.__dict__['track2groups'].keys()
    replicates = [str(x).split("-")[2] for x in rep_agg]

    ref_gtf = str(infile).split("-")[1]

    # Drop the directory prefix only. str.strip("deseq.dir/") would remove
    # any of those *characters* from both ends and mangle condition names
    # that start or end with one of them.
    condition = str(infile).split("-")[0]
    _dir_prefix = "deseq.dir/"
    if condition.startswith(_dir_prefix):
        condition = condition[len(_dir_prefix):]

    time_points = ",".join([str(i) for i in time_points])
    replicates = ",".join(replicates)

    statement = '''
    cgat data2resamples
    --log=%(outfile)s.log
    --time=%(time_points)s
    --replicates=%(replicates)s
    --condition=%(condition)s
    --resamples=%(resampling_resample)s
    --input-gtf=%(ref_gtf)s
    --output-file-directory=clustering.dir
    --seed=%(resampling_seed)s
    %(infile)s
    '''

    P.run()

    P.touch(outfile)
def computeGOSimilarity(self, funcData):
    """Compute pairwise GO similarity between drugs and return it as XML.

    Args:
        funcData: XML text containing ``<row>`` elements, each with a
            ``<drug>`` and a ``<GO>`` child.

    Returns:
        An XML string ``<data><row><drug1/><drug2/><sim/></row>...</data>``
        with one row per ordered pair of drugs (including each drug with
        itself). ``sim`` is the first element of R's ``mgoSim`` result for
        the two drugs' GO term lists (``ont='MF'``, ``organism="human"``,
        ``measure="Wang"``).
    """
    # Function-scope import so the block does not rely on file-level imports.
    from xml.sax.saxutils import escape

    print(funcData)
    root = etree.fromstring(funcData)

    # Group GO terms by drug, keeping first-seen order of the drugs.
    drugsToGo = {}
    for row in root.xpath('//row'):
        drug = row.xpath('./drug/text()')[0].strip()
        go_term = row.xpath('./GO/text()')[0].strip()
        drugsToGo.setdefault(drug, []).append(go_term)

    compute = robjects.r['mgoSim']

    # Collect fragments and join once instead of repeated string +=.
    parts = ['<?xml version="1.0"?>', '<data>']
    for d1, terms1 in drugsToGo.items():
        for d2, terms2 in drugsToGo.items():
            print(terms1)
            print(terms2)
            x = compute(robjects.StrVector(terms1),
                        robjects.StrVector(terms2),
                        ont='MF', organism="human", measure="Wang")
            # Names come from parsed text nodes, so &, < and > must be
            # escaped again to keep the output well-formed.
            parts.append("<row>")
            parts.append("<drug1>" + escape(d1) + "</drug1>")
            parts.append("<drug2>" + escape(d2) + "</drug2>")
            parts.append("<sim>" + str(x[0]) + "</sim>")
            parts.append("</row>")
    parts.append("</data>")
    return "".join(parts)
def calculate_degree_modularity(self, targetModule):
    '''
    calculates in degree (kIn) and out degree (kOut)
    for the target module, and stores them together with the density
    kIn / (size * (size - 1) / 2) in self.modules[targetModule]
    '''
    members = self.__get_module_members(targetModule)
    degree = rsnippets.degree(self.adjacency, ro.StrVector(members),
                              self.args.edgeWeight)

    module = self.modules[targetModule]
    module['kIn'] = int(degree.rx2('kIn')[0])
    module['kOut'] = int(degree.rx2('kOut')[0])

    size = float(module['size'])
    # Number of distinct member pairs.
    pair_count = size * (size - 1.0) / 2.0
    module['density'] = float(module['kIn']) / pair_count
def build_dataframe(monitor: Monitor) -> dplyr.DataFrame:
    """Stack the monitor's three series into one long-format data frame.

    For each of ``susceptible``, ``incubating`` and ``sick`` the frame gets
    one row per entry of ``monitor.day``: the series name (``what``), the
    day (``day``) and the value of that series (``count``).
    """
    series_names = ('susceptible', 'incubating', 'sick')

    labels = []
    days = []
    counts = []
    for name in series_names:
        for day in monitor.day:
            labels.append(name)
            days.append(day)
        counts.extend(getattr(monitor, name))

    return dplyr.DataFrame({
        'what': ro.StrVector(labels),
        'day': ro.IntVector(days),
        'count': ro.IntVector(counts)
    })
def Morris(repet, factors, binf, bsup):
    """ Simplified import of R'Morris function

    Builds an 'oat' design (levels=5, grid.jump=3) and calls R's morris
    with it.

    Args:
        repet: passed to morris as ``r``.
        factors: factor names.
        binf: lower bounds, converted to a numpy array.
        bsup: upper bounds, converted to a numpy array.

    Returns:
        ``(m, pdict)``: the R morris object and a dict mapping each column
        name of its ``X`` component to the column's values as a list.
    """
    factors = robj.StrVector(factors)
    binf = numpy.array(binf)
    bsup = numpy.array(bsup)
    d = r.list('oat', 5, 3)
    d = r['names<-'](d, ['type', 'levels', 'grid.jump'])
    m = r.morris(factors=factors, r=repet, design=d, binf=binf, bsup=bsup)
    param = r['data.frame'](m.r["X"][0])
    pdict = dict(
        (k, numpy.array(param.r[k][0]).tolist()) for k in r.colnames(param))
    # dict.keys() is not indexable on Python 3; take the first column via
    # an iterator instead (0 when there are no columns at all).
    n_runs = len(next(iter(pdict.values()), []))
    # Parenthesised single-argument form is valid on Python 2 and 3.
    print('Computational cost Morris SA is %d' % n_runs)
    return m, pdict
def run(self):
    """Source the gnn_vegan R script and call its ``write_vegan`` function.

    The call receives ``self.method``, ``self.spp_file``, ``self.env_file``,
    ``self.variables`` (as an R string vector), ``self.id_field``,
    ``self.species_transform``, ``self.species_downweighting`` and
    ``self.ord_file``, in that order.
    """
    from rpy2 import robjects

    VEGAN_SCRIPT = 'L:/resources/code/models/pre_process/gnn_vegan.r'

    r_session = robjects.r

    # Source the gnn_vegan R file
    r_session.source(VEGAN_SCRIPT)

    # Create an R vector to pass
    var_vector = robjects.StrVector(self.variables)

    # Create the vegan file
    r_session.write_vegan(
        self.method,
        self.spp_file,
        self.env_file,
        var_vector,
        self.id_field,
        self.species_transform,
        self.species_downweighting,
        self.ord_file,
    )
def barPlot(dict_, keysInOrder=None, printCounts=True, ylim=None, *args, **kwdargs):
    """ Plot a bar plot

    Args:
        dict_: a dictionary of name -> value, where value is the height of
            the bar; use a collections.OrderedDict() to easily convey the
            order of the groups
        keysInOrder: an optional ordering of the keys in dict_ (alternate
            option to using collections.OrderedDict)
        printCounts: option to print the counts on top of each bar
        ylim: optional y range; by default it runs from the smallest to the
            largest height, with the top raised by 10% when printCounts is
            set

    additional kwdargs are passed directly to r.barplot()

    Returns:
        whatever r.barplot() returns
    """
    if not keysInOrder:
        keysInOrder = dict_.keys()

    heights = ro.FloatVector([dict_[key] for key in keysInOrder])
    kwdargs["names.arg"] = ro.StrVector(keysInOrder)

    if ylim is None:
        lowest = min(heights)
        highest = max(heights)
        if printCounts:
            # Head room for the labels drawn above the bars.
            highest = highest * 1.1
        ylim = [lowest, highest]

    x = r.barplot(heights, *args, ylim=ro.FloatVector(ylim), **kwdargs)

    if printCounts:
        labels = ["{:.2g}".format(height) for height in heights]
        r.text(x, ro.FloatVector(heights), ro.StrVector(labels), pos=3)

    return x
def r_ttest(py_data1, py_data2, mu=0, alt="two.sided"):
    """Run R's paired ``t.test`` on two datasets and return the p-value.

    Defined for use within apply_ttest function.

    Args:
        py_data1, py_data2: the two samples; they are first passed through
            ``equal_data``.
        mu: forwarded to ``t.test`` as ``mu``.
        alt: forwarded to ``t.test`` as ``alternative``.
    """
    # Make sure datasets have equal dimensions
    py_data1, py_data2 = equal_data(py_data1, py_data2)

    # Bring in t.test function from R
    t_test = R.r['t.test']

    # Dotted R argument names cannot be written as Python keywords.
    dotted_options = {
        'paired': True,
        'na.action': R.StrVector(("na.exclude",)),
        'alternative': R.StrVector((alt,)),
    }

    # Perform t-test on two populations
    test = t_test(R.FloatVector(py_data1), R.FloatVector(py_data2),
                  mu=mu, **dotted_options)

    # Locate the p-value by name in the returned R list.
    p_value_pos = test.names.index('p.value')
    return test[p_value_pos][0]
def deseq_results(dds, condition1, condition2, out_dir):
    """Write the DESeq2 results for one contrast to a tab-separated file.

    The contrast is ``['Group', condition1, condition2]`` and the output is
    ``<out_dir>/<condition1>-<condition2>-results.txt``, indexed by the R
    row names of the result.
    """
    # Get DESeq2 results.
    to_dataframe = robjects.r('function(x) data.frame(x)')
    contrast = robjects.StrVector(['Group', condition1, condition2])
    r_results = to_dataframe(deseq.results(dds, contrast=contrast))

    # Capture the row names before converting to pandas.
    gene_ids = r_results.rownames
    res = pandas2ri.ri2py_dataframe(r_results)
    res.index = gene_ids

    # Output results.
    out_path = os.path.join(out_dir,
                            f'{condition1}-{condition2}-results.txt')
    res.to_csv(out_path, sep='\t')
def Morris(repeat, factors, binf, bsup):
    """ Simplified import of R'Morris function

    Calls R's morris with an 'oat' design (levels=5, grid.jump=3).

    Returns:
        ``(m, pdict)``: the R morris object and a dict mapping each column
        name of its ``X`` component to that column as a Python list.
    """
    factor_names = robj.StrVector(factors)
    lower_bounds = numpy.array(binf)
    upper_bounds = numpy.array(bsup)

    design = r['names<-'](r.list('oat', 5, 3),
                          ['type', 'levels', 'grid.jump'])
    m = r.morris(factors=factor_names, r=repeat, design=design,
                 binf=lower_bounds, bsup=upper_bounds)

    # R's `$` operator extracts components by name.
    dollar = r['$']
    param = r['data.frame'](dollar(m, 'X'))

    pdict = {}
    for k in param.names:
        pdict[k] = numpy.array(dollar(param, k)).tolist()
    return m, pdict
def train_elastic_net_wrapper(features_data_, features_, d_, data_annotation_, x_w=None, prune=True, nested_folds=10):
    """Convert the inputs to R objects and call ``train_elastic_net``.

    Args:
        features_data_: mapping from feature id to that feature's values.
        features_: table whose ``id`` column lists the features to use.
        d_: table with an ``"individual"`` column and a column named after
            ``data_annotation_.gene_id`` (the response).
        data_annotation_: object providing ``gene_id``.
        x_w: optional weights, forwarded as ``penalty_factor``.
        prune: not used by this function.
        nested_folds: forwarded as ``n_train_test_folds``.

    Returns:
        The first two elements of the R result, each converted with
        ``pandas2ri.ri2py``.
    """
    feature_ids = features_.id.values
    feature_rows = numpy.array([features_data_[v] for v in feature_ids])

    row_names = robjects.StrVector(d_["individual"])
    col_names = robjects.StrVector(feature_ids)
    dimnames = robjects.ListVector([(1, row_names), (2, col_names)])

    # One column per feature: R fills the matrix column by column from the
    # row-major flattening of the (feature x value) array.
    x = robjects.r["matrix"](robjects.FloatVector(feature_rows.flatten()),
                             ncol=features_.shape[0], dimnames=dimnames)
    y = robjects.FloatVector(d_[data_annotation_.gene_id])
    nested_folds = robjects.FloatVector([nested_folds])

    # py2ri chokes on None, so the argument is only passed when it is set.
    if x_w is not None:
        # observation weights, not explanatory variable weight :(
        # (an `x_weight = x_w` argument was considered here)
        res = train_elastic_net(y, x, penalty_factor=x_w,
                                n_train_test_folds=nested_folds)
    else:
        res = train_elastic_net(y, x, n_train_test_folds=nested_folds)

    return pandas2ri.ri2py(res[0]), pandas2ri.ri2py(res[1])
def selectRiverBarPlots(self, filename):
    """Draw one river/count bar plot per selected parameter.

    For every entry ``parm`` of ``self.selectParm`` the mapping returned by
    ``self.countAllRiverParm(parm)`` is turned into a two-column R data
    frame (``River``, ``Count``) and handed to ``self.barPlot`` together
    with the file name ``<filename>_<parm>_bar_river``.

    Does nothing when ``self.selectParm`` is ``None``.
    """
    # Identity test: `!= None` goes through __ne__ and misbehaves for
    # objects that overload comparison.
    if self.selectParm is None:
        return

    for parm in self.selectParm:
        riverDict = self.countAllRiverParm(parm)
        # Fresh dict per plot; dict views are materialised because the R
        # vector constructors are given plain sequences.
        vectorDict = {
            'River': robjects.StrVector(list(riverDict.keys())),
            'Count': robjects.IntVector(list(riverDict.values())),
        }
        newFilename = filename + "_" + parm + "_bar_river"
        self.barPlot(robjects.DataFrame(vectorDict), newFilename,
                     "River", "Count")
def _generate_table(self, declarations):
    """Generates an R data frame table from the list of declarations.

    Each declaration is mapped to a field dict with ``self._map_fields``;
    the values are gathered per field name to form the columns.
    """
    # R DataFrame is column-major, so collect the values column by column.
    decl_table = defaultdict(list)
    for decl in declarations:
        for field_name, field_value in self._map_fields(decl).items():
            decl_table[field_name].append(field_value)

    # Have to translate into a properly typed vector, otherwise R will
    # treat the data in a bad way.
    columns = {}
    for field_name, values in decl_table.items():
        if field_name in STR_COLUMNS:
            columns[field_name] = robjects.StrVector(values)
        else:
            columns[field_name] = robjects.FloatVector(values)
    return robjects.DataFrame(columns)
def convert_to_r_dataframe(self, df, strings_as_factors=False):
    """
    Convert a pandas DataFrame to a R data.frame.

    Parameters
    ----------
    df: The DataFrame being converted
    strings_as_factors: Whether to turn strings into R factors
        (default: False)

    Returns
    -------
    A R data.frame whose row names are the DataFrame index (as strings)
    and whose column names are the DataFrame columns.
    """
    import rpy2.rlike.container as rlc

    columns = rlc.OrdDict()

    # FIXME: This doesn't handle MultiIndex
    for column in df:
        series = df[column]
        value_type = series.dtype.type

        if value_type == np.datetime64:
            columns[column] = com.convert_to_r_posixct(series)
            continue

        # Replace nulls by the NA marker registered for this dtype.
        cleaned = []
        for item in series:
            cleaned.append(
                item if pd.notnull(item) else com.NA_TYPES[value_type])

        r_vector = com.VECTOR_TYPES[value_type](cleaned)

        if not strings_as_factors:
            # R's I() keeps data.frame() from converting to factors.
            as_is = ro.baseenv.get("I")
            r_vector = as_is(r_vector)

        columns[column] = r_vector

    r_dataframe = ro.DataFrame(columns)
    r_dataframe.rownames = ro.StrVector(list(df.index))
    r_dataframe.colnames = list(df.columns)

    return r_dataframe
def asVector(val: Iterable, names: Optional[Iterable] = None) -> robj.Vector:
    """Build the R vector matching the NumPy dtype kind of ``val``.

    ``val`` is passed through ``ll`` and ``np.asarray``; integer kinds
    (``i``, ``u``) give an ``IntVector``, ``f`` a ``FloatVector``, ``b`` a
    ``BoolVector`` and string kinds (``S``, ``U``) a ``StrVector``.

    Args:
        val: the values to convert.
        names: optional element names, set on the vector as strings.

    Raises:
        TypeError: for any other dtype kind.
    """
    val = np.asarray(ll(val))
    kind = val.dtype.kind

    constructors = {
        'i': robj.IntVector,
        'u': robj.IntVector,
        'f': robj.FloatVector,
        'b': robj.BoolVector,
        'S': robj.StrVector,
        'U': robj.StrVector,
    }

    vect = constructors[kind](val) if kind in constructors else None

    if missing(vect):
        raise TypeError(f'unknown vector type [{val.dtype.kind}]')

    if available(names):
        name_array = np.asarray(ll(names), dtype=str)
        vect.names = robj.StrVector(name_array)

    return vect
def r_matrix(np_matrix, col_names=None):
    """Convert a numpy matrix to an R matrix.

    Args:
        np_matrix: two-dimensional numpy array.
        col_names: optional column names, one per column. If none are
            provided the columns are named ['x_1', 'x_2', ...].

    Returns:
        The R matrix with its column names set.

    Raises:
        ValueError: if `np_matrix` is not two-dimensional, or if the number
            of `col_names` differs from the number of columns.
    """
    if np_matrix.ndim != 2:
        msg = 'Input input dimension is %s and MUST be 2' % np_matrix.ndim
        raise ValueError(msg)

    n_row, n_col = np_matrix.shape

    if col_names is None:
        col_names = ['x_%s' % (i + 1) for i in range(n_col)]
    else:
        # Validate before any R call so a mismatch is reported clearly
        # instead of surfacing as an R error once the matrix exists.
        col_names = list(col_names)
        if len(col_names) != n_col:
            raise ValueError('Expected %d column names but got %d'
                             % (n_col, len(col_names)))

    r_mat = robjects.r.matrix(np_matrix, nrow=n_row, ncol=n_col)
    r_mat.colnames = robjects.StrVector(col_names)
    return r_mat