def get_federov_data(self, factors):
    """Describe each factor as one row of an R data frame.

    For every name in ``factors`` the pair stored in
    ``self.parameter_ranges[name]`` supplies the lower limit (first element)
    and the number of levels (second element); the upper limit is the
    second element minus one.  The remaining columns are constant: centre
    and rounding are 0, ``factor`` and ``mix`` are False.

    Returns the data frame with its columns in the order
    var, low, high, center, nLevels, round, factor, mix.
    """
    column_order = [
        "var", "low", "high", "center", "nLevels", "round", "factor", "mix"
    ]
    ranges = [self.parameter_ranges[name] for name in factors]
    count = len(ranges)

    columns = {
        "var": StrVector(factors),
        "low": IntVector([bounds[0] for bounds in ranges]),
        "high": IntVector([bounds[1] - 1 for bounds in ranges]),
        "center": IntVector([0] * count),
        "nLevels": IntVector([bounds[1] for bounds in ranges]),
        "round": IntVector([0] * count),
        "factor": BoolVector([False] * count),
        "mix": BoolVector([False] * count),
    }
    # The frame is built from a dict, so select the columns explicitly to
    # pin their order.
    return DataFrame(columns).rx(StrVector(column_order))
def qcrop2(xlist, ylist, labels=None, nq=4.):
    """Stack several (x, y) series into one R data frame with cropped copies.

    Args:
        xlist: sequence of x series (one list per facet).
        ylist: sequence of y series, parallel to ``xlist``.
        labels: facet label for each series; defaults to "0", "1", ...
        nq: passed through to ``qlim1`` when computing each series' limits.

    Returns:
        An R ``DataFrame`` with columns ``x``, ``y`` (the raw values),
        ``xcrop``, ``ycrop`` (the same values, with both coordinates replaced
        by ``nan`` when the point lies outside the limits returned by
        ``qlim1``) and ``facet`` (a factor whose levels follow ``labels``).
    """
    if labels is None:
        # Must be a real list: it is indexed below, and ``map`` is a lazy
        # iterator on Python 3.
        labels = [str(i) for i in range(len(xlist))]

    x = []
    y = []
    xcrop = []
    ycrop = []
    facet = []
    for i, (onex, oney) in enumerate(zip(xlist, ylist)):
        xmin, xmax = qlim1(onex, nq)
        ymin, ymax = qlim1(oney, nq)
        # Appending point by point (instead of unzipping a list of pairs)
        # also copes with an empty series.
        for vx, vy in zip(onex, oney):
            if vy > ymax or vy < ymin or vx < xmin or vx > xmax:
                xcrop.append(nan)
                ycrop.append(nan)
            else:
                xcrop.append(vx)
                ycrop.append(vy)
        x += onex
        y += oney
        facet += [labels[i]] * len(onex)

    return DataFrame({
        'x': FloatVector(x),
        'y': FloatVector(y),
        'xcrop': FloatVector(xcrop),
        'ycrop': FloatVector(ycrop),
        'facet': FactorVector(StrVector(facet), levels=StrVector(labels)),
    })
def bargraph_language(results):
    """Write one lines-of-code bar chart per language.

    ``results`` maps ``(language, problem, variation)`` keys to a line
    count.  For each entry of the module-level ``languages`` the matching
    rows are plotted as dodged bars (x = problem, fill = variation) into
    ``bargraph-loc-lang-<language>.pdf``.
    """
    r = robjects.r
    for language in languages:
        # Keep only this language's rows, in the mapping's iteration order.
        rows = [(prob, var, loc)
                for (lang, prob, var), loc in results.items()
                if lang == language]
        varis = [pretty_varis[var] for _, var, _ in rows]
        probs = [prob for prob, _, _ in rows]
        locs = [loc for _, _, loc in rows]

        r.pdf('bargraph-loc-lang-' + language + '.pdf',
              height=pdf_height(), width=pdf_width())
        df = robjects.DataFrame({
            'Variation': StrVector(varis),
            'Problem': StrVector(probs),
            'Lines': IntVector(locs),
        })

        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Lines', fill='Variation')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('ylab("Lines of Code")'))
        pp.plot()
        r['dev.off']()
def Run(self):
    """Read per-gene means from ``self.gene_means`` and draw a corrplot.

    Three tab-separated input layouts are accepted, chosen by
    ``self.filetype``:

    * ``"gene_means"``: the last line starting with ``#`` names the samples
      (from the 4th column on); every other line contributes its values
      from the 4th column on.
    * ``"anova"`` / ``"zinb"``: the last header line (starting with ``#``
      or containing both ``pval`` and ``padj``) names the conditions; only
      rows whose second-to-last column is below 0.05 are kept, and only
      the ``n`` mean columns starting at the 4th column are used.

    Any other file type prints a message and exits with status -1.  The
    means are handed, column by column, to the function returned by
    ``self.make_corrplotFunc()`` together with the column headers, the
    ``"<col0>/<col1>"`` gene names and ``self.outfile``.
    """
    self.transit_message("Starting Corrplot")

    # The last comment/header line seen supplies the column names.
    headers = None
    data, means = [], []

    if self.filetype == "gene_means":
        with open(self.gene_means) as infile:
            for line in infile:
                w = line.rstrip().split('\t')
                if line[0] == '#':
                    headers = w[3:]
                    continue  # last comment line has names of samples
                data.append(w)
                means.append([float(x) for x in w[3:]])
    elif self.filetype == "anova" or self.filetype == "zinb":
        n = -1  # number of conditions
        with open(self.gene_means) as infile:
            for line in infile:
                w = line.rstrip().split('\t')
                # check for 'pval' for backwards compatibility
                if line[0] == '#' or ('pval' in line and 'padj' in line):
                    headers = w
                    continue  # keep last comment line as headers
                if n == -1:
                    # ANOVA header line has names of conditions, organized
                    # as 3+2*n+3 (2 groups (means, LFCs) X n conditions).
                    # ZINB header line has names of conditions, organized
                    # as 3+4*n+3 (4 groups X n conditions).
                    if self.filetype == "anova":
                        n = int((len(w) - 6) / 2)
                    elif self.filetype == "zinb":
                        n = int((len(headers) - 6) / 4)
                    headers = headers[3:3 + n]
                    headers = [x.replace("Mean_", "") for x in headers]
                # take just the columns of means
                vals = [float(x) for x in w[3:3 + n]]
                qval = float(w[-2])
                if qval < 0.05:
                    data.append(w)
                    means.append(vals)
    else:
        print("filetype not recognized: %s" % self.filetype)
        sys.exit(-1)

    print("correlations based on %s genes" % len(means))

    genenames = ["%s/%s" % (w[0], w[1]) for w in data]
    headers = [h.replace("Mean_", "") for h in headers]
    columns = {}
    for i, col in enumerate(headers):
        columns[col] = FloatVector([x[i] for x in means])
    df = DataFrame(columns)  # can't figure out how to set rownames

    corrplotFunc = self.make_corrplotFunc()
    # pass headers to put cols in order, since df comes from dict
    corrplotFunc(df, StrVector(headers), StrVector(genenames), self.outfile)

    self.finish()
    self.transit_message("Finished Corrplot")
def _mark_timestamp(self, blSegsL):
    """Mark segments in the final sample.

    ``blSegsL`` holds one list of baseline segments per sample.  Every
    segment of the last segment pool that is not in the last baseline list
    is overlapped (in R, via GenomicRanges/IRanges) against all baseline
    lists; its ``tag`` is set to the 1-based index of the last baseline
    list it overlaps, as a string.

    Approach, as described by the original author:
      1. list all baselines (or the baseline differences of adjacent data);
      2. map the non-baseline segments onto them, i.e. find the largest
         index of a baseline set the segment falls into;
      3. use that index as the timestamp.
    """
    from rpy2.robjects.packages import importr
    from rpy2.robjects import IntVector, StrVector, globalenv
    import rpy2.robjects as robjects

    GR = importr('GenomicRanges')
    IR = importr('IRanges')

    def to_granges(segs):
        # One GRanges object covering the given segments.
        chromNames = StrVector([seg.chromName for seg in segs])
        starts = IntVector([seg.start for seg in segs])
        ends = IntVector([seg.end for seg in segs])
        return GR.GRanges(seqnames=chromNames,
                          ranges=IR.IRanges(starts, ends))

    # Build the GRangesList on the R side, one element per sample.
    globalenv["GRL"] = GR.GRangesList()
    for position, blSegs in enumerate(blSegsL, start=1):
        globalenv["tempGR"] = to_granges(blSegs)
        robjects.r("GRL[[{0}]]=tempGR".format(str(position)))
    GRL = robjects.r["GRL"]

    # The list stores references to the target segments, so updating the
    # items of nonBlSegs updates the pool itself.
    nonBlSegs = list(set(self._segPoolL[-1].segments) - set(blSegsL[-1]))
    nonBlGR = to_granges(nonBlSegs)

    # fo = IR.findOverlaps(nonBlGR, GRL)
    # For large SCNA
    fo = IR.findOverlaps(nonBlGR, GRL, minoverlap=5000)
    globalenv["fo"] = fo
    robjects.reval("fom <- as.matrix(fo)")

    # Reshape the flat R matrix into rows (query, subject) and shift the
    # indices to 0-based, e.g.
    # [[2, 2, 3, 3],
    #  [1, 2, 1, 2]]
    fom = robjects.r.fom
    overlapIdx = np.array(list(fom)).reshape(tuple(reversed(fom.dim))) - 1
    queryIdx = overlapIdx[0,]
    subjectIdx = overlapIdx[1,]

    for index in set(queryIdx):
        hits = np.where(queryIdx == index)[0]
        ts = np.max(subjectIdx[hits] + 1)
        nonBlSegs[index].tag = str(ts)
def residues_groups(site_type, modified_residues):
    """Return the residue groups for a site type as an R string vector.

    Phosphorylation sites get the two fixed groups ``'S|T'`` and ``'Y'``;
    every other site type gets a single group joining all
    ``modified_residues`` with ``'|'``.
    """
    if site_type.name != 'phosphorylation':
        # TODO: better grouping residues for site-specific enzymes:
        # for glycosylation there are ~16 enzymes; the idea would be
        # to load "site" : "terminal sugar" associations (e.g. from
        # O-GlycBase) and then map "terminal sugar" : "enzyme" for enzymes
        # known to catalyze glycosylation with given "terminal sugar"
        # (& for given link type).
        # Some additional literature review might be needed.
        return StrVector(['|'.join(modified_residues)])

    return StrVector(['S|T', 'Y'])
def bargraph_variation_diff():
    """Plot the relative coding-time difference between variations.

    For each (standard, expert) pair, namely seq/expertseq and
    par/expertpar, and each language/problem combination, the difference is
    ``(expert + standard) / standard - 1``, read from the module-level
    ``result`` mapping.  A combination missing either time is plotted as 0.
    One PDF named ``bargraph-codingtime-diff-<standard>.pdf`` is written
    per pair.
    """
    r = robjects.r
    for standard, expert in (('seq', 'expertseq'), ('par', 'expertpar')):
        langs, probs, diffs = [], [], []
        for lang in languages:
            for prob in problems:
                try:
                    base_time = result[lang][prob][standard]
                    expert_time = result[lang][prob][expert]
                except KeyError:
                    # Either measurement is missing: show no difference.
                    diff = 0
                else:
                    diff = (float(expert_time + base_time)
                            / float(base_time) - 1)
                langs.append(pretty_langs[lang])
                probs.append(prob)
                diffs.append(diff)

        r.pdf('bargraph-codingtime-diff-' + standard + '.pdf',
              height=pdf_height(), width=pdf_width())
        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Difference': FloatVector(diffs),
        })

        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Difference', fill='Language')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('ylab("Coding time difference (in percent)")')
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('scale_y_continuous(labels = percent_format())'))
        pp.plot()
        r['dev.off']()
def __init__(self, item, names=None, verbose=False):
    """Make data instance from a python or R (rpy2) object input item.

    Args:
        item: the object to wrap.  ``ListVector``, R ``DataFrame`` and
            ``dict`` inputs become a dict of ``PyR`` values in
            ``self.iloc``; anything else becomes a numpy array.
        names: optional names overriding ``item.names``; converted with
            ``StrListVector``.
        verbose: when true, print a one-line summary of what was built.

    Attributes set (only when available): ``names``, ``rownames``,
    ``colnames``; always: ``verbose``, ``dim`` and ``iloc``.
    """
    self.verbose = verbose
    self.dim = ()

    # Store names, colnames, rownames (as StrVectors) when the item has them.
    if hasattr(item, 'names'):
        self.names = item.names
    if names is not None:
        self.names = StrListVector(names)
    if hasattr(item, 'colnames'):
        self.colnames = item.colnames
    if hasattr(item, 'rownames'):
        self.rownames = item.rownames

    # pandas objects carry their labels in index / columns / name instead.
    if isinstance(item, (Series, DataFrame)):
        self.rownames = StrVector(item.index)
    if isinstance(item, DataFrame):
        self.colnames = StrVector(item.columns)
    if isinstance(item, Series) and item.name is not None:
        self.colnames = StrVector([item.name])

    if isinstance(item, (ListVector, ro.vectors.DataFrame, dict)):
        # Convert to dict if dict-like (i.e. ListVector, R DataFrame).
        try:
            names = [
                self.names[i] if isinstance(self.names, StrVector)
                else self.names[0][i]
                for i in range(len(item))
            ]
        except Exception:
            # No usable stored names (missing attribute, NULL, too short):
            # fall back to the item's own keys.
            names = [k for k, v in item.items()]
        self.iloc = {n: PyR(v) for n, (k, v) in zip(names, item.items())}
        if verbose:
            print(f"PyR: dict (len={len(self.iloc)}){type(item)}")
    else:
        # Not dict-like, so convert to numpy array and apply shape dims.
        self.iloc = np.array(item)
        if hasattr(item, 'dim'):
            self.dim = tuple(item.dim)
            if len(self.dim) > 1:
                # R stores arrays column-major.
                self.iloc = self.iloc.reshape(tuple(item.dim), order='F')
        self.dim = self.iloc.shape

        # ``names`` is only set when the item had names or ``names`` was
        # passed in, so read it defensively.
        own_names = getattr(self, 'names', None)
        if (not hasattr(self, 'rownames') and len(self.iloc.shape) > 1
                and own_names and isinstance(own_names, ListVector)):
            self.rownames = own_names[0]  # try to infer rownames
        if (not hasattr(self, 'colnames') and len(self.iloc.shape) > 1
                and own_names and isinstance(own_names, ListVector)):
            self.colnames = own_names[1]  # try to infer colnames
        if verbose:
            print(f"PyR: ndarray {self.iloc.shape} {type(item)}")
def draw_hist(length, pdfname='hist.pdf', b=25, m=700, wd=8, hd=6):
    """Draw a sequence-length histogram into a PDF using R.

    Args:
        length: iterable of integer sequence lengths.
        pdfname: output PDF path.
        b: bin width (step of the break sequence).
        m: upper end of the break sequence, which starts at 0.
        wd: PDF width passed to R's ``pdf``.
        hd: PDF height passed to R's ``pdf``.
    """
    # Earlier variants, kept for reference:
    #   - bucket counts: group sorted(length) by x // 5 and print each range
    #   - matplotlib: pyplot.hist(x=array(length), bins=50) with
    #     xlim(400, 500), axis labels and a title

    # Hand every input to R's global environment under a short name.
    bindings = (
        ("dd", IntVector(length)),
        ("nm", StrVector([pdfname])),
        ("b", IntVector([b])),
        ("m", IntVector([m])),
        ("wd", IntVector([wd])),
        ("hd", IntVector([hd])),
    )
    for r_name, r_value in bindings:
        robjects.globalenv[r_name] = r_value

    r_script = '''
    library(ape)
    pdf(nm,width=wd,height=hd)
    xcol=seq(0,m,b)
    hist(dd,freq=TRUE,breaks=xcol,col='#228B22',xlab='Sequence Length',ylab='Sequence number',main='Distribution of Sequence Length')
    dev.off()
    '''
    robjects.r(r_script)
def StrListVector(strList):
    """Convert input to a StrVector, or a ListVector recursively.

    Returns ``NULL`` for anything empty or without a length (NULL, None,
    '', non-str scalars, ...).  A ``ListVector`` or ``StrVector`` is
    copied, a single ``str`` becomes a one-element ``StrVector``, a
    sequence containing any list-like element becomes an unnamed
    ``ListVector`` of recursively converted elements, and any other
    sequence becomes a ``StrVector``.
    """
    # Explicit emptiness test instead of ``assert``, which is stripped when
    # Python runs with -O.
    try:
        is_empty = len(strList) == 0
    except Exception:
        # Deliberately broad: whatever cannot report a length is treated
        # as "nothing to convert".
        return NULL
    if is_empty:
        return NULL

    if isinstance(strList, ListVector):  # already a ListVector
        return ListVector(strList)
    if isinstance(strList, StrVector):  # already a StrVector
        return StrVector(strList)
    if isinstance(strList, str):  # str scalar, so apply StrVector
        return StrVector([strList])
    if any(types.is_list_like(s) for s in strList):  # not the deepest list
        return ListVector([(None, StrListVector(s)) for s in strList])
    return StrVector(list(strList))  # is deepest list(-like) of str types
def create_roast_scorer(gene_sets='c2.cp.kegg', id_type='entrez',
                        grouping='by_substance', q_value_cutoff=0.1,
                        na_action='fill_0', cache=True,
                        cache_signatures=False):
    """Build a ROAST-based scoring function.

    Enable ``cache_signatures`` only when running permutations; in any
    other case caching the signatures just slows things down.

    The returned scorer (wrapped by ``scoring_function``) runs ``roast`` on
    the disease and on the compound, drops disease gene sets whose
    ``fdr_q-val`` exceeds ``q_value_cutoff``, combines both results with
    ``combine_gsea_results`` and returns the mean of the ``score`` column.
    It returns None when the compound has fewer than two cases or controls,
    or when R raises an error.
    """
    importr('limma')
    importr('Biobase')

    collection = db.load(gene_sets=gene_sets, id_type=id_type)
    gene_sets_r = ListVector({
        gene_set.name: StrVector(list(gene_set.genes))
        for gene_set in collection.gene_sets
    })

    def set_gene_set_collection():
        # Publish the collection in R's global environment under its name.
        globalenv[gene_sets] = gene_sets_r

    def roast_score(disease: ExpressionWithControls,
                    compound: ExpressionWithControls):
        if (len(compound.cases.columns) < 2
                or len(compound.controls.columns) < 2):
            print(
                f'Skipping {compound} not enough degrees of freedom (no way to compute in-group variance)'
            )
            return None

        if cache:
            multiprocess_cache_manager.respawn_cache_if_needed()

        try:
            disease_gene_sets = roast(disease, gene_sets=gene_sets,
                                      use_cache=cache)
            above_cutoff = disease_gene_sets[
                disease_gene_sets['fdr_q-val'] > q_value_cutoff]
            disease_gene_sets.drop(above_cutoff.index, inplace=True)

            signature_gene_sets = roast(
                compound, gene_sets=gene_sets,
                use_cache=cache and cache_signatures)

            joined = combine_gsea_results(disease_gene_sets,
                                          signature_gene_sets, na_action)

            # Occasionally ask R to collect garbage.
            if randint(0, 100) == 1:
                r('gc()')

            return joined.score.mean()
        except RRuntimeError as e:
            print(e)
            return None

    return scoring_function(roast_score,
                            input=ExpressionWithControls,
                            grouping=grouping,
                            before_batch=set_gene_set_collection)
def configure(self, params):
    """Configure the predictor and load the R model.

    After the parent's ``configure``, unset class labels are replaced by
    R's ``NULL`` (a given ``_class_labels`` list becomes a ``StrVector``),
    the common and scoring R scripts are sourced and the R side is
    initialised with the model path and target type.

    Raises:
        DrumCommonException: in unstructured mode, when the R handler does
            not provide the load-model or score-unstructured hook.
    """
    super(RPredictor, self).configure(params)

    r_null = ro.rinterface.NULL
    if self._positive_class_label is None:
        self._positive_class_label = r_null
    if self._negative_class_label is None:
        self._negative_class_label = r_null
    self._class_labels = (r_null if self._class_labels is None
                          else StrVector(self._class_labels))

    r_handler.source(R_COMMON_PATH)
    r_handler.source(R_SCORE_PATH)
    r_handler.init(self._custom_model_path, self._target_type.value)

    if self._target_type == TargetType.UNSTRUCTURED:
        required_hooks = (
            CustomHooks.LOAD_MODEL,
            CustomHooks.SCORE_UNSTRUCTURED,
        )
        for hook_name in required_hooks:
            if hasattr(r_handler, hook_name):
                continue
            raise DrumCommonException(
                "In '{}' mode hook '{}' must be provided.".format(
                    TargetType.UNSTRUCTURED.value, hook_name))

    self._model = r_handler.load_serialized_model(self._custom_model_path)
def c_index_from_r(values, isdead, nbdays, values_test, isdead_test,
                   nbdays_test, isfactor=False):
    """Fit a Cox model in R and return the concordance index on test data.

    A model ``Surv(nbdays, isdead) ~ values`` is fitted with
    ``survival.coxph`` on the training vectors, used to predict on
    ``values_test`` and scored with R's ``concordance.index``
    (``method='noether'``) against ``nbdays_test`` / ``isdead_test``.

    Returns ``nan`` (after printing the error) when the concordance
    computation raises.
    """
    rob.r('set.seed(2016)')

    isdead = FloatVector(isdead)
    isdead_test = FloatVector(isdead_test)
    nbdays = FloatVector(nbdays)
    nbdays_test = FloatVector(nbdays_test)
    values = FloatVector(values)
    values_test = FloatVector(values_test)

    if isfactor:
        # NOTE(review): the values pass through FloatVector first, so
        # non-numeric labels would fail before reaching this point;
        # confirm that is intended.
        values = StrVector(values)
        values_test = StrVector(values_test)

    cox = Formula('Surv(nbdays, isdead) ~ values')
    training = (('nbdays', nbdays), ('isdead', isdead), ('values', values))
    for r_name, r_vector in training:
        cox.environment[r_name] = r_vector

    res = survival.coxph(cox)
    frame = rob.r('data.frame')
    predict = rob.r.predict(res, frame(values=values_test))
    concordance_index = rob.r('concordance.index')

    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            c_index = concordance_index(predict,
                                        nbdays_test,
                                        isdead_test,
                                        method='noether')
    except Exception as e:
        print("exception found for c index!: {0}".format(e))
        return nan

    del res, cox, frame
    return c_index[0][0]
def bargraph_language(cfg, values):
    """Write one execution-time bar chart (with error bars) per language.

    For every language in ``cfg.languages`` and every problem/variation
    combination, the samples in ``values[prob][var][lang][0]`` are
    averaged, and the half-width of R's ``t.test`` confidence interval at
    the 99.9% level is used as the error bar.  The chart is written to
    ``bargraph-executiontime-lang-<lang>.pdf``.
    """
    r = robjects.r
    for lang in cfg.languages:
        times = []
        varss = []
        probs = []
        ses = []
        for prob in cfg.problems:
            for var in cfg.variations:
                # The pretty variation names are what ends up in the legend.
                varss.append(pretty_varis[var])
                probs.append(prob)

                data = FloatVector(values[prob][var][lang][0])
                times.append(r['mean'](data)[0])

                # The keyword must be exactly "conf.level": with a stray
                # leading space R does not match it to the parameter and
                # silently uses the default 95% interval.
                t_result = r['t.test'](data, **{
                    "conf.level": 0.999
                }).rx('conf.int')[0]
                ses.append((t_result[1] - t_result[0]) / 2)

        r.pdf('bargraph-executiontime-lang-' + lang + '.pdf',
              height=pdf_height(), width=pdf_width())
        df = robjects.DataFrame({
            'Variation': StrVector(varss),
            'Problem': StrVector(probs),
            'Time': FloatVector(times),
            'SE': FloatVector(ses)
        })

        limits = ggplot2.aes(ymax='Time + SE', ymin='Time - SE')
        dodge = ggplot2.position_dodge(width=0.9)
        gp = ggplot2.ggplot(df)

        pp = (gp
              + ggplot2.aes_string(x='Problem', y='Time', fill='Variation')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2.geom_errorbar(limits, position=dodge, width=0.25)
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('ylab("Execution time (in seconds)")'))
        pp.plot()
        r['dev.off']()
def sites_mutated_ratio(path='static/plot.png', width=1400, height=900,
                        dpi=72, exclude: List[str] = None,
                        glycosylation='together'):
    """Plot, per source and site type, the share of sites hit by mutations.

    For every confirmed source and for disordered / ordered regions, the
    ratios from ``sites_mutated_ratio_by_type`` are collected and drawn as
    stacked bars (fill = disordered region yes/no), faceted by source.
    When ``path`` is truthy the plot is saved there with ``ggsave``;
    ``width`` and ``height`` are pixels, converted to inches via ``dpi``.
    """
    from pandas import DataFrame
    from helpers.ggplot2 import GG
    from rpy2.robjects.packages import importr
    from rpy2.robjects import StrVector

    rows = []
    for disorder in (True, False):
        region_label = 'Yes' if disorder else 'No'
        for source in source_manager.confirmed:
            ratios = sites_mutated_ratio_by_type(
                source.name,
                disordered=disorder,
                relative=False,
                display=False,
                exclude=exclude,
                glycosylation=glycosylation)
            rows.extend(
                {
                    'site_type': site_name,
                    'disordered_region': region_label,
                    'percentage': percentage,
                    'source': source.name
                }
                for site_name, percentage in ratios.items())

    df = DataFrame(rows)
    ggplot2 = importr("ggplot2")

    theme_options = {
        'axis.text.x': ggplot2.element_text(angle=90, hjust=1),
        'axis.text': ggplot2.element_text(size=15),
        'text': ggplot2.element_text(size=14),
        'legend.text': ggplot2.element_text(size=14),
        'legend.position': 'bottom'
    }

    base = ggplot2.ggplot(
        df,
        ggplot2.aes_string(x='site_type', y='percentage',
                           fill='disordered_region'))
    bars = ggplot2.geom_bar(stat='identity',
                            position=ggplot2.position_stack(reverse=True))
    axis_labels = ggplot2.labs(
        x='Site type',
        y='Percentage of sites affected by mutations',
        fill='Is site in disordered region?')

    plot = (GG(base)
            + bars
            + ggplot2.facet_grid('~source')
            + ggplot2.theme(**theme_options)
            + axis_labels
            + ggplot2.scale_fill_manual(
                values=StrVector(["#998ec3", "#f1a340"])))

    if path:
        ggplot2.ggsave(str(path),
                       width=width / dpi,
                       height=height / dpi,
                       dpi=dpi,
                       units='in',
                       bg='transparent')
def uninstall_grf():
    """Make sure the R package ``grf`` is not installed before a test runs."""
    if not rpackages.isinstalled("grf"):
        return

    robjects.r.options(download_file_method="curl")
    r_utils = rpackages.importr("utils")
    r_utils.chooseCRANmirror(ind=0)
    r_utils.remove_packages(StrVector(["grf"]))
def python_type_to_R_type(cls, pobject=None):
    """Convert a list, numpy array or pandas Series to an rpy2 vector.

    The vector type is chosen from the numpy dtype name: ``int*`` gives an
    ``IntVector``, ``float*`` a ``FloatVector``, ``str*`` and ``bool*`` a
    ``StrVector``.  Any other dtype returns the (numpy) array itself, and
    any input that is not a list, array or Series is returned unchanged.
    """
    if not isinstance(pobject, (list, np.ndarray, pd.Series)):
        return pobject

    if isinstance(pobject, (list, pd.Series)):
        pobject = np.array(pobject)

    dtype_name = pobject.dtype.name
    if dtype_name.startswith('int'):
        return IntVector(pobject)
    if dtype_name.startswith('float'):
        return FloatVector(pobject)
    # NOTE(review): booleans are turned into strings here, not R logicals;
    # confirm this is intended.
    if dtype_name.startswith(('str', 'bool')):
        return StrVector(pobject)
    return pobject
def install_e1071():
    """Install the R package ``e1071`` from CRAN.

    This only needs to be called once for each package on each machine.
    Do it from an interactive session, because it might ask questions.
    """
    import rpy2.robjects.packages as rpackages
    from rpy2.robjects import StrVector

    package_names = StrVector(['e1071'])
    r_utils = rpackages.importr('utils')
    r_utils.chooseCRANmirror(ind=1)
    r_utils.install_packages(package_names)
def bargraph_language():
    """Write one coding-time bar chart per language.

    Times come from the module-level ``result[language][problem][var]``
    mapping; a missing entry counts as 0.  For the ``expert*`` variations
    the time of the matching non-expert variation is added when present.
    Each chart goes to ``bargraph-codingtime-lang-<language>.pdf``.
    """
    r = robjects.r
    for language in languages:
        varis, probs, times = [], [], []
        for prob in problems:
            for var in variations:
                try:
                    minutes = result[language][prob][var]
                except KeyError:
                    minutes = 0

                # For the expert times, add expert and non-expert times
                # together.
                if var.startswith('expert'):
                    plain_var = var.replace('expert', '')
                    try:
                        minutes = minutes + result[language][prob][plain_var]
                    except KeyError:
                        pass

                varis.append(pretty_varis[var])
                probs.append(prob)
                times.append(minutes)

        r.pdf('bargraph-codingtime-lang-' + language + '.pdf',
              height=pdf_height(), width=pdf_width())
        df = robjects.DataFrame({
            'Variation': StrVector(varis),
            'Problem': StrVector(probs),
            'Time': IntVector(times),
        })

        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Time', fill='Variation')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('ylab("Coding time (in minutes)")'))
        pp.plot()
        r['dev.off']()
def plot_bar(stats, output_file=None, **kw):
    # Box plot of goodness-of-fit p-values per model, faceted by dataset.
    #
    # NOTE: this is Python 2 code (print statements, raw_input, indexing
    # the result of dict.values()).
    #
    # stats: mapping whose values are lists of runs; each run is a sequence
    #     of per-model dicts with the keys 'name', 'with_rate',
    #     'from_directory' and 'gs_p' read below.
    # output_file: when given, the plot is saved there with ggsave;
    #     otherwise it is printed and the function waits for Enter.
    # kw: accepted but not used in this function.

    # Model labels are taken from the first run of the first entry;
    # '+Gamma' is appended where 'with_rate' is truthy.
    names = [r['name'] for r in stats.values()[0][0]]
    with_rates = [r['with_rate'] for r in stats.values()[0][0]]
    names = [n + ('+Gamma' if w else '') for n, w in zip(names, with_rates)]

    # Group the runs by the directory recorded in their first model.
    by_dir = defaultdict(list)
    for triad in stats:
        for r in stats[triad]:
            by_dir[r[0]['from_directory']].append(r)

    # Transpose each group: one sequence of gs_p values per model position.
    for d in by_dir:
        by_dir[d] = zip(*[[gs_p(_r['gs_p']) for _r in r] for r in by_dir[d]])

    runs = []
    g_stats = []
    data = []
    alpha = 0
    for d, v in by_dir.items():
        # The dataset label is derived from the directory path components.
        if 'exons' in d.split('/'):
            dataset = 'Nuclear'
        elif 'mtDNA' in d.split('/'):
            dataset = 'Mitochondrial'
        else:
            dataset = 'Microbial'
        print dataset
        for j, g in enumerate(v):
            g_stats += g
            data += [dataset] * len(g)
            runs += [j] * len(g)
            # NOTE(review): prints the share of values above 0.05; under
            # Python 2 integer division this is 0 or 1 unless true division
            # is enabled elsewhere in the file -- confirm.
            print names[j], sum(1 for _g in g if _g > 0.05) / len(g)
            alpha = max(alpha, get_alpha(g))
            print 'Samples', len(g)

    # The model names become an R expression() used as x-axis labels.
    labels = 'expression(' + ','.join(names) + ')'
    df = DataFrame({
        'run': IntVector(runs),
        'g_stat': FloatVector(g_stats),
        'data': StrVector(data)
    })
    globalenv['df'] = df
    R('library(scales)')
    # Layers that are currently disabled:
    # 'geom_jitter(alpha=0.2, size=1) + ' + \
    # 'geom_boxplot(fill=NA, outlier.size=0, size=1.5, color=alpha("white", 0.5)) + ' + \
    # 'geom_boxplot(alpha=0.8, outlier.size=0) + ' + \
    # 'geom_hline(yintercept=0.05, size=1.5, alpha=0.5, color="white") + ' + \
    # 'geom_hline(yintercept=0.05, color="black") + ' + \
    cmd = 'gg <- ggplot(df, aes(factor(run), g_stat)) + ' + \
        'ylab("Goodness-of-Fit p-value") + xlab("Model") + ' + \
        'geom_boxplot(outlier.size=1, outlier.colour=alpha("black",'+str(alpha)+')) + ' + \
        'scale_x_discrete(labels=' + labels + ') + ' + \
        'theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) + ' + \
        'facet_grid(. ~ data)'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print R['gg']
        raw_input('Press Enter to continue...')
def generate_valid_sample(self, sample_size):
    """Draw ``sample_size`` distinct valid coordinates and encode them.

    Random coordinates from ``self.getRandomCoord()`` are kept when they
    have not been seen before and ``self.constraint`` evaluates truthy for
    their performance parameters.  For each kept point, the index of every
    parameter value in ``self.parameter_values`` is recorded.  Progress is
    logged roughly every tenth of the sample and every million tested
    coordinates.

    Returns the result of ``self.encode_data`` applied to an R data frame
    holding the recorded indices, with columns ordered as
    ``self.axis_names``.

    Note: the loop only ends once enough valid points have been found.
    """
    search_space_dataframe = {n: [] for n in self.axis_names}
    search_space = {}
    evaluated = 0

    # At least 1, so the progress check below never divides by zero when
    # fewer than ten points are requested.
    progress_step = max(1, int(sample_size / 10))

    info(
        "Generating valid search space of size {0} (does not spend evaluations)"
        .format(sample_size))

    while len(search_space) < sample_size:
        candidate_point = self.getRandomCoord()
        candidate_point_key = str(candidate_point)
        evaluated += 1

        if candidate_point_key not in search_space:
            perf_params = self.coordToPerfParams(candidate_point)
            # NOTE(review): eval() runs the constraint as Python code; the
            # constraint expression must come from a trusted source.
            is_valid = eval(self.constraint, copy.copy(perf_params),
                            dict(self.input_params))

            if is_valid:
                search_space[candidate_point_key] = candidate_point

                for n in perf_params:
                    candidate_value = self.parameter_values[n].index(
                        perf_params[n])
                    search_space_dataframe[n].append(candidate_value)

                if len(search_space) % progress_step == 0:
                    info("Valid coordinates: " + str(len(search_space)) +
                         "/" + str(sample_size))
                    info("Tested coordinates: " + str(evaluated))

            if evaluated % 1000000 == 0:
                info("Tested coordinates: " + str(evaluated))

    info("Valid/Tested configurations: " + str(len(search_space)) + "/" +
         str(evaluated))

    for k in search_space_dataframe:
        search_space_dataframe[k] = IntVector(search_space_dataframe[k])

    search_space_dataframe_r = DataFrame(search_space_dataframe)
    # Built from a dict, so select the columns explicitly to fix their order.
    search_space_dataframe_r = search_space_dataframe_r.rx(
        StrVector(self.axis_names))

    info("Generated Search Space:")
    info(str(self.base.summary_default(search_space_dataframe_r)))

    return self.encode_data(search_space_dataframe_r)
def __init__(self, ytrue, ypred, cutoff=None, cutoffvariable="threshold"):
    """Compute the AUC, its confidence interval and cutoff metrics with pROC.

    Args:
        ytrue: true labels, stored as ``self.ytrue``.
        ypred: predicted scores, stored as ``self.ypred``.
        cutoff: value at which the metrics are read; ``None`` asks pROC
            for its "best" cutoff.
        cutoffvariable: the ROC variable ``cutoff`` refers to.

    Attributes set: ``aucobj``, ``auc``, ``ci``, ``lowci``, ``highci``,
    ``binary`` (0/1 predictions, only when an explicit threshold cutoff is
    given; otherwise ``None``) and the six metrics ``threshold``,
    ``specificity``, ``sensitivity``, ``accuracy``, ``ppv``, ``npv``.
    """
    self.ytrue = ytrue
    self.ypred = ypred
    self.aucobj = self._get_auc_obj()
    self.auc = self.aucobj[8][0]

    # pROC returns the AUC interval as (lower bound, median, upper bound),
    # so the upper bound is the third element, not the second.
    self.ci = rpackage_pROC.ci(self.aucobj, x="best")
    self.lowci = self.ci[0]
    self.highci = self.ci[2]

    metric_names = [
        "threshold", "specificity", "sensitivity", "accuracy", "ppv", "npv"
    ]
    cutoffmetrics = rpackage_pROC.coords(
        self.aucobj,
        "best" if cutoff is None else cutoff,
        cutoffvariable,
        ret=StrVector(metric_names))

    # Predictions can only be binarised against an explicit threshold.
    self.binary = None
    if cutoff is not None and cutoffvariable == "threshold":
        self.binary = [1 if x > cutoff else 0 for x in ypred]

    self.threshold = cutoffmetrics[0]
    self.specificity = cutoffmetrics[1]
    self.sensitivity = cutoffmetrics[2]
    self.accuracy = cutoffmetrics[3]
    self.ppv = cutoffmetrics[4]
    self.npv = cutoffmetrics[5]
def install_telescope():
    """Install the R package ``telescope`` from its GitHub archive.

    This function has not been tested and is not guaranteed to work!
    ``devtools`` and ``remotes`` are installed from CRAN first.
    """
    from rpy2.robjects import StrVector

    r_utils = rpackages.importr('utils')
    r_utils.chooseCRANmirror(ind=1)

    prerequisites = ('devtools', 'remotes')
    r_utils.install_packages(StrVector(prerequisites))

    rpy2.robjects.r(
        'remotes::install_url(url="https://github.com/DescartesResearch/telescope/archive/master.zip", INSTALL_opt= "--no-multiarch")'
    )
def bargraph_variation_diff(cfg, values):
    """Plot the relative execution-time difference between variations.

    For each (standard, expert) pair, namely seq/expertseq and
    par/expertpar, and each language/problem combination, the samples in
    ``values[prob][variation][lang][0]`` are averaged in R and the
    difference is ``mean_expert / mean_standard - 1``.  One PDF named
    ``bargraph-executiontime-diff-<standard>.pdf`` is written per pair.
    """
    r = robjects.r
    r_mean = r['mean']

    for standard, expert in (('seq', 'expertseq'), ('par', 'expertpar')):
        langs, probs, diffs = [], [], []
        for lang in cfg.languages:
            for prob in cfg.problems:
                data = FloatVector(values[prob][standard][lang][0])
                data_expert = FloatVector(values[prob][expert][lang][0])

                mean = r_mean(data)[0]
                mean_expert = r_mean(data_expert)[0]

                langs.append(pretty_langs[lang])
                probs.append(prob)
                diffs.append(float(mean_expert) / float(mean) - 1)

        r.pdf('bargraph-executiontime-diff-' + standard + '.pdf',
              height=pdf_height(), width=pdf_width())
        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Difference': FloatVector(diffs),
        })

        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Difference', fill='Language')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('ylab("Execution time difference (in percent)")')
              + robjects.r('scale_y_continuous(labels = percent_format())'))
        pp.plot()
        r['dev.off']()
def bargraph_variation_norm(results):
    """Plot lines of code normalised to the smallest implementation.

    ``results`` maps ``(language, problem, variation)`` to a line count.
    For every variation and problem, each language's count is divided by
    the smallest count among the module-level ``languages``.  One PDF named
    ``bargraph-loc-var-norm-<variation>.pdf`` is written per variation.
    """
    r = robjects.r
    for variation in variations:
        langs, probs, locs = [], [], []
        for problem in problems:
            loc_by_lang = {
                lang: results[(lang, problem, variation)]
                for lang in languages
            }
            smallest = float(min(loc_by_lang.values()))
            for lang, loc in loc_by_lang.items():
                langs.append(pretty_langs[lang])
                probs.append(problem)
                locs.append(float(loc) / smallest)

        r.pdf('bargraph-loc-var-norm-' + variation + '.pdf',
              height=pdf_height(), width=pdf_width())
        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Lines': FloatVector(locs),
        })

        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Lines', fill='Language')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('ylab("Lines of Code (normalized to smallest)")'))
        pp.plot()
        r['dev.off']()
def tupls2RDataframe(data, titles):
    """Turn a sequence of tuples into an R data frame of factor columns.

    ``data`` is a sequence of records; element ``i`` of every record goes
    into the column named ``titles[i]``.  Each column is converted to
    strings and then to an R factor.
    """
    # Transpose the records into one value list per title.
    columns = [[] for _ in titles]
    for record in data:
        for position, value in enumerate(record):
            columns[position].append(value)

    factors = {
        title: FactorVector(StrVector(tuple(columns[position])))
        for position, title in enumerate(titles)
    }
    return DataFrame(factors)
def bspl(Pi, Xi, nbas=20, fdn=['Temperature', 'Salinity']):
    """Fit a B-spline functional data object with R's ``fda.Data2fd``.

    Args:
        Pi: argument values; its first and last entries bound the basis.
        Xi: numpy array of observations, converted to R with numpy2ri.
        nbas: number of basis functions passed to ``create_basis``.
        fdn: variable names appended to ``['Level', 'Station']`` to form
            the ``fdnames`` argument.

    Returns:
        The functional data object returned by ``fda.Data2fd``.
    """
    spline_basis = create_basis(Pi[0], Pi[-1], nbas)

    with localconverter(numpy2ri.converter) as cv:
        observations_r = cv.py2rpy(Xi)

    fd_names = StrVector(['Level', 'Station'] + fdn)
    fdobj = fda.Data2fd(argvals=FloatVector(Pi),
                        y=observations_r,
                        basisobj=spline_basis,
                        fdnames=fd_names)

    # Report the shape of the first element of the fitted object.
    shape = np.array(fdobj[0]).shape
    print("{0} B-splines computed for {1} variables.".format(
        shape[1], shape[2]))
    return fdobj
def mem_usage_graph(cfg):
    """Plot the memory usage of every variation/language/problem.

    For each combination, the first line of the file named by
    ``get_mem_output(lang, prob, var)`` is read as a float.  Everything is
    drawn into the single file ``bargraph-memusage.pdf``: dodged bars per
    problem, filled by language and faceted by variation.
    """
    r = robjects.r

    varis, langs, probs, mems = [], [], [], []
    for var in cfg.variations:
        for lang in cfg.languages:
            for prob in cfg.problems:
                with open(get_mem_output(lang, prob, var), 'r') as mem_file:
                    first_line = mem_file.readline()
                mems.append(float(first_line))
                varis.append(pretty_varis[var])
                langs.append(pretty_langs[lang])
                probs.append(prob)

    # Memory usage is a simple histogram with all information in one graph.
    r.pdf('bargraph-memusage.pdf', height=pdf_height(), width=pdf_width())
    df = robjects.DataFrame({
        'Language': StrVector(langs),
        'Problem': StrVector(probs),
        'Variation': StrVector(varis),
        'Mem': FloatVector(mems)
    })

    # We rotate the x labels to make sure they don't overlap.
    rotated_labels = ggplot2.opts(
        **{'axis.text.x': ggplot2.theme_text(angle=90, hjust=1)})

    pp = (ggplot2.ggplot(df)
          + rotated_labels
          + ggplot2.aes_string(x='Problem', y='Mem', fill='Language')
          + ggplot2.geom_bar(position='dodge', stat='identity')
          + ggplot2.facet_wrap('Variation')
          + ggplot2_options()
          + ggplot2_colors()
          + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
          + robjects.r('ylab("Memory usage (in bytes)")'))
    pp.plot()
    r['dev.off']()
def bargraph_variation_diff(results):
    """Plot the relative lines-of-code difference between variations.

    ``results`` maps ``(language, problem, variation)`` to a line count.
    For each (standard, expert) pair, namely seq/expertseq and
    par/expertpar, the difference ``expert / standard - 1`` is plotted per
    language and problem into ``bargraph-loc-diff-<standard>.pdf``.
    """
    r = robjects.r
    for standard, expert in (('seq', 'expertseq'), ('par', 'expertpar')):
        langs, probs, diffs = [], [], []
        for lang in languages:
            for prob in problems:
                loc = results[(lang, prob, standard)]
                loc_expert = results[(lang, prob, expert)]

                langs.append(pretty_langs[lang])
                probs.append(prob)
                diffs.append(float(loc_expert) / float(loc) - 1)

        r.pdf('bargraph-loc-diff-' + standard + '.pdf',
              height=pdf_height(), width=pdf_width())
        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Difference': FloatVector(diffs),
        })

        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Difference', fill='Language')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('ylab("Lines of code difference (in percent)")')
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('scale_y_continuous(labels = percent_format())'))
        pp.plot()
        r['dev.off']()
def ro(self):
    """Expose a view as RObject, to be manipulated in R environment.

    A dict ``iloc`` becomes an unnamed ``ListVector`` of its converted
    values; an array becomes a ``FloatVector``, ``IntVector`` or
    ``StrVector`` depending on its dtype, flattened in column-major order
    and reshaped to an R array when ``self.dim`` has more than one
    dimension.  Stored ``rownames``, ``colnames`` and ``names`` are then
    copied onto the result.
    """
    # Convert to R vector of correct data type.  The branches form a single
    # chain: a dict must not fall through to the array-only ``reshape``
    # branches below.
    if isinstance(self.iloc, dict):
        # Convert the stored values (iterating the dict itself would yield
        # only its keys); wrap any value that is not already a PyR.
        out = ListVector([
            (None, v.ro if isinstance(v, PyR) else PyR(v).ro)
            for v in self.iloc.values()
        ])
    elif types.is_float_dtype(self.iloc):
        out = FloatVector(self.iloc.reshape(-1, order='F'))
    elif types.is_integer_dtype(self.iloc):
        out = IntVector(self.iloc.reshape(-1, order='F'))
    else:
        out = StrVector(self.iloc.reshape(-1, order='F'))

    if len(self.dim) > 1:  # reshape to R Array if has non-trivial dim
        out = ro.r.array(out, dim=IntVector(self.dim))

    # Collect R object name attributes.
    if hasattr(self, 'rownames'):
        out.rownames = StrVector(self.rownames)
    if hasattr(self, 'colnames'):
        out.colnames = StrVector(self.colnames)
    if hasattr(self, 'names'):
        out.names = (ListVector(self.names)
                     if isinstance(self.names, ListVector)
                     else StrVector(self.names))
    return out