def predict(self, X=None):
    """Predict values with the fitted R model.

    Args:
        X: Feature data to predict on. It is converted with
            ``pandas2ri.py2ri``, so presumably a pandas DataFrame
            (TODO confirm against callers). If None, R's ``predict`` is
            called on the fitted model with ``OOB=True`` instead of new
            data.

    Returns:
        numpy.ndarray: One entry per prediction returned by R. For
        classification forests the entries are the factor level labels;
        otherwise they are the values R returned.

    Raises:
        Exception: If ``self.r_model`` is None, i.e. no model has been
            trained yet.
    """
    if self.r_model is None:
        raise Exception('Model must be trained first.')

    if X is None:
        r_rtn = r.predict(self.r_model, OOB=True)
    else:
        r_X = pandas2ri.py2ri(X)
        r_rtn = r.predict(self.r_model, newdata=r_X)

    if self.forest_type_is_classification:
        # Iterating an R factor yields its 1-based level codes, so each code
        # maps directly to a label. The previous code used the code as a
        # position into the factor again, which returned labels belonging to
        # other rows.
        levels = r_rtn.levels
        return np.asarray([levels[code - 1] for code in r_rtn])

    return np.asarray(list(r_rtn))
def method_spline(rvar, train, test):
    """Fit a B-spline linear model with a treatment interaction and report it.

    The response ``rvar`` is regressed with R's ``lm`` on a B-spline of
    ``OverallRank``, ``treat`` and their interaction (no intercept); a
    ``year`` term is appended when ``rvar`` is ``'Tuition'``. The model's
    R-squared is printed, then ``analytics`` is called on the training fit
    and, unless ``rvar`` is ``"UndergraduatemedianGPA"``, on predictions for
    ``test``.
    """
    print("Splines")

    formula_parts = [
        rvar,
        ' ~ bs(OverallRank, df=6) + treat + ',
        'treat:bs(OverallRank, df=6) - 1',
    ]
    if rvar == 'Tuition':
        formula_parts.append(' + year')
    formula = ''.join(formula_parts)

    model = r.lm(formula, data=train)
    print(r.summary(model).rx2('r.squared'))

    observed_train = train[rvar]
    fitted_train = np.array(r.predict(model))
    analytics(rvar, 'Training', observed_train, fitted_train)

    if rvar != "UndergraduatemedianGPA":
        observed_test = test[rvar]
        predicted_test = np.array(r.predict(model, newdata=test))
        analytics(rvar, 'Testing', observed_test, predicted_test)

    print()
def R_run_loess(x, y, span=0.75):
    """Fit ``y ~ x`` with R's loess and subtract the fitted trend from ``y``.

    Both inputs are modified in place: positions flagged by
    ``utils.where_null`` are overwritten with R's ``NA_Real`` before fitting.

    Returns:
        tuple: ``(corrected_y, correction_factor)`` where
        ``correction_factor`` holds the loess predictions at ``x`` and
        ``corrected_y`` is ``y`` minus those predictions.
    """
    # Replace null-like entries (e.g. Inf/-Inf) with R's NA.
    x[utils.where_null(x)] = robj.NA_Real
    # NOTE(review): this mask is computed from x (after the substitution
    # above), not from y -- confirm that is intended.
    y[utils.where_null(x)] = robj.NA_Real

    frame = robj.DataFrame({"x": x, "y": y})
    fit = r.loess("y ~ x", data=frame, span=span, family="symmetric")

    predicted = r.predict(fit, x)
    correction_factor = np.array(list(predicted))
    corrected_y = np.array(list(y)) - correction_factor
    return corrected_y, correction_factor
def get_team_salary():
    """Predict a team salary from the request's query parameters.

    Reads ``conference``, ``division`` and ``qb`` from ``request.args``
    (``qb`` is converted with ``int``), builds a one-row R data frame, runs
    R's ``predict`` with the module-level ``my_model`` and returns the inputs
    together with the first predicted value as JSON.
    """
    query = request.args
    conference = query.get("conference")
    division = query.get("division")
    qb = int(query.get("qb"))

    make_frame = r['data.frame']
    my_test = make_frame(CONF=conference, DIV=division, QB=qb)
    print(my_test)

    salary = r.predict(my_model, my_test)
    print(salary)

    payload = {
        'conference': conference,
        'division': division,
        'qb': qb,
        'salary': salary[0],
    }
    return jsonify(payload)
def run_loess(x, y, span=0.75):
    """Fit ``y ~ x`` with R's loess and subtract the fitted trend from ``y``.

    Both inputs are modified in place: positions flagged by
    ``where_na_like`` are overwritten with R's ``NA_Real`` before fitting.

    Returns:
        tuple: ``(corrected_y, correction_factor)`` where
        ``correction_factor`` holds the loess predictions at ``x`` and
        ``corrected_y`` is ``y`` minus those predictions.
    """
    # Replace NA-like entries (e.g. Inf/-Inf) with R's NA.
    x[where_na_like(x)] = robj.NA_Real
    # NOTE(review): this mask is computed from x (after the substitution
    # above), not from y -- confirm that is intended.
    y[where_na_like(x)] = robj.NA_Real

    frame = robj.DataFrame({"x": x, "y": y})
    fit = r.loess("y ~ x", data=frame, span=span, family="symmetric")

    predicted = r.predict(fit, x)
    correction_factor = np.array(list(predicted))
    corrected_y = np.array(list(y)) - correction_factor
    return corrected_y, correction_factor
def rloess_smooth(xdata, ydata, xfit, span=0.07, deg=2):
    """Smooth ``ydata`` against ``xdata`` with R's loess, evaluated at ``xfit``.

    Args:
        xdata: x values of the data points.
        ydata: y values of the data points.
        xfit: x values at which the smoothed curve is evaluated.
        span: Passed to R's ``loess`` as ``span``.
        deg: Passed to R's ``loess`` as ``deg``.

    Returns:
        numpy.ndarray: The loess predictions at ``xfit``.
    """
    frame = ro.DataFrame({"x": xdata, "y": ydata})
    fit = r.loess('y ~ x', data=frame, span=span, deg=deg)
    predicted = r.predict(fit, xfit)
    return np.array(list(predicted))
def meanVar(_files, _gff_file , _output):
    """Count reads per gene, fit a mean-variance trend and simulate counts.

    Steps, as performed below:
      1. Register genes from ``_gff_file`` according to the module-level
         ``MODE`` ("all-genes" or "AS-genes").
      2. Call ``countSam`` once per entry of ``_files`` to fill the per-gene
         counts, and write them to ``<_output>.rawcounts``.
      3. Compute per-gene mean and variance, fit ``var ~ mean`` with R's
         loess on the rows with mean > 0, and predict the variance at every
         mean.
      4. Replace each gene's counts with ``NREPS`` negative-binomial draws
         (or zeros when the derived ``n`` is not positive) and write them to
         ``<_output>.nbcounts``.

    Exits the process with status 1 when ``_files`` has exactly one entry.

    Returns:
        dict: gene identifier -> list of ``NREPS`` simulated counts.
    """
    NFILE=len(_files)
    if NFILE == 1:
        sys.stderr.write("Need at least two samples for each group.\n")
        sys.exit(1)
    #####
    _dict_counts = dict() ## dictionary of gene counts
    _genes = HTSeq.GenomicArrayOfSets("auto",stranded=False)
    idx=0
    if MODE == "all-genes":
        # Register every gene feature, plus genes only seen via a transcript.
        for feature in _gff_file:
            if feature.type in GENE:
                _dict_counts[ feature.name ] = [0]*NFILE
                _genes[feature.iv] += feature.name
            if feature.type in TX:
                if feature.attr["geneID"] not in _dict_counts:
                    _dict_counts[feature.attr["geneID"]] = [0]*NFILE
                    _genes[feature.iv] += feature.attr["geneID"]
    if MODE == "AS-genes":
        ## Bug: Does not report last gene in gff if it has at least two transcript
        # Only genes for which more than one transcript was collected are kept.
        transcript= set()
        cur_line = None
        last_gene_id = None
        for feature in _gff_file:
            if feature.type in GENE:
                # A new gene line flushes the previously seen gene.
                if len(transcript) >1:
                    _dict_counts[ cur_line.name ] = [0]*NFILE
                    _genes[cur_line.iv] += cur_line.name
                cur_line = feature
                transcript.clear()
            if feature.type in TX:
                key = None
                if "geneID" in feature.attr:
                    key = "geneID"
                elif "Parent" in feature.attr:
                    key = "Parent"
                else:
                    # NOTE(review): key stays None here, so the lookups
                    # below will fail -- confirm this is acceptable.
                    sys.stderr.write("transcript line does not have Parent or geneID field\n")
                if last_gene_id == feature.attr[key]:
                    transcript.add(feature.attr["ID"])
                else:
                    if len(transcript) > 1:
                        if feature.attr[key] not in _dict_counts:
                            _dict_counts[feature.attr[key]] = [0]*NFILE
                            _genes[feature.iv] += feature.attr[key]
                    transcript.clear()
                    transcript.add(feature.attr["ID"])
                    last_gene_id = feature.attr[key]
            if feature.type in EXON:
                transcript.add(feature.attr["Parent"])
    print "num of genes to simulate: ", len(_dict_counts)
    _file_raw_count = open(_output+'.rawcounts','w')
    _file_nb_count = open(_output+'.nbcounts','w')
    ## This loop reads through the input list and calls countSam for each input file
    for f in _files:
        sam_file=HTSeq.SAM_Reader(f)
        _dict_counts=countSam(sam_file, _genes,_dict_counts, idx)
        # NOTE(review): assumes entries of _files are file-like objects with
        # close(); this would fail for plain path strings -- confirm.
        f.close()
        idx += 1
        sys.stderr.write("library %d has generated.\n" % idx)
    ## Print raw counts in file specified by <out>
    for key, value in sorted(_dict_counts.iteritems()):
        _file_raw_count.write(key+"\t"+"\t".join(map(str,value))+"\n")
    _file_raw_count.close()
    ## calculate group mean and variance
    # Lists are built in sorted-key order; the simulation loop below relies
    # on iterating in the same order.
    list_mean = list()
    list_var = list()
    for key, value in sorted(_dict_counts.iteritems()):
        list_mean.append(np.mean(np.array(value)))
        list_var.append(np.var(np.array(value)))
    ## compute loess estimates
    ## The following code is using rpy2 module
    a = robjects.FloatVector(list_mean)
    b = robjects.FloatVector(list_var)
    df = robjects.DataFrame({"mean": a, "var": b})
    non0_df=df.rx(df.rx2("mean").ro > 0, True) ## subsetting if mean > 0
    loess_fit = r.loess("var ~ mean", data=non0_df, degree=2)
    '''
    #good-of-fit test:
    variance=r.predict(loess_fit, 1000)
    print variance[0]
    print (1000*1000)/(variance[0]-1000)
    '''
    var_pred = r.predict(loess_fit, a)
    # This loop overwrites the entries of _dict_counts with the simulated count data
    count_idx = 0
    for key, value in sorted(_dict_counts.iteritems()):
        # n = mean^2 / (predicted variance - mean)
        n = math.pow(list_mean[count_idx],2)/(var_pred[count_idx]-list_mean[count_idx])
        n = int(n) # n: number of failures
        if n<=0:
            _dict_counts[key] = [0]*NREPS
        else:
            p = n/float(n+list_mean[count_idx]) # p: prob of success
            _dict_counts[key] = nbinom.rvs(n, p, size=NREPS).tolist()
        count_idx += 1
    #var_pred = r.predict(loess_fit, a)
    for key, value in sorted(_dict_counts.iteritems()):
        _file_nb_count.write(key+"\t"+"\t".join(map(str,value))+"\n")
    _file_nb_count.close()
    # The raw-count file was already closed above; this second close is redundant.
    _file_raw_count.close()
    return _dict_counts
# Bring in the rpy2 bridge and the R packages used below
from rpy2.robjects import r
from rpy2.robjects.packages import importr

stats = importr("stats")
base = importr('base')

# Let R load the team data from CSV
read_csv = r['read.csv']
team_data = read_csv('data/nfl-teams.csv')
print(team_data)

# Fit a linear model of TOTAL on all other columns and show its summary
my_model = stats.lm("TOTAL ~ .", data=team_data)
model_summary = base.summary(my_model)
print(model_summary)

# Read three whitespace-separated values from the user
answers = input("Enter conference, division and QB salary ").split()
conference, division, qb = answers
print(conference, division, qb)

# Build a one-row R data frame from the answers and predict with the model
my_test = r['data.frame'](CONF=conference, DIV=division, QB=int(qb))
print(my_test)
predicted = r.predict(my_model, my_test)
print(predicted)
def AFS(order, a=6, q=0.95, d=0.25):
    """Return the blaze-removed spectrum of one order (AFS algorithm).

    ``order`` is modified in place: its two columns are renamed to
    ``"wv"`` and ``"intens"``, ``"intens"`` is rescaled, and a ``"select"``
    column is added.

    Args:
        order: Two-column pandas DataFrame (wavelength, intensity).
            Presumably indexed 0..n-1, since rows are addressed both by
            label and by position below -- TODO confirm.
        a: The alpha-shape radius is the wavelength range divided by ``a``.
        q: Quantile used to pick the top points inside each window.
        d: ``span`` passed to both loess fits.

    Returns:
        The scaled intensities divided by the final loess estimate, one
        value per pixel.
    """
    # Default value of q and d are 0.95 and 0.25.
    # Change the column names and format of the dataset.
    order.columns = ["wv", "intens"]
    # n records the number of pixels.
    n = order.shape[0]
    # ref is a pandas series recording wavelength
    ref = order["wv"]
    # Variable u is the parameter u in the step 1 of AFS algorithm. It scales the intensity vector.
    u = (ref.max() - ref.min()) / 10 / order["intens"].max()
    order["intens"] = order["intens"] * u
    # Let alpha be 1/6 of the wavelength range of the whole order.
    alpha = (order["wv"].max() - order["wv"].min()) / a

    # This chunk of code detects loops in the boundary of the alpha shape.
    # Usually there is only one loop (polygon).
    # Variable loops is a list.
    # The indices of the k-th loop are recorded in the k-th element of variable loops.
    loops = []
    # Variable points is a list that represents all the sample points (lambda_i, y_i)
    points = [(order["wv"][i], order["intens"][i])
              for i in range(order.shape[0])]
    #tl=time()
    alpha_shape = alphashape.alphashape(points, 1 / alpha)
    #th=time()
    # print("alphashape function takes ", th-tl)

    # Input variables:
    # polygon: shapely polygon object
    # Return variable:
    # a list recording the indices of the vertices in the polygon
    # (each vertex is matched to a pixel by exact wavelength equality)
    def find_vertices(polygon):
        coordinates = list(polygon.exterior.coords)
        return [
            ref[ref == coordinates[i][0]].index[0]
            for i in range(len(coordinates))
        ]

    # if alpha_shape is just a polygon, there is only one loop
    # if alpha_shape is a multi-polygon, we iterate over it and find all the loops.
    if (isinstance(alpha_shape, shapely.geometry.polygon.Polygon)):
        temp = find_vertices(alpha_shape)
        loops.append(temp)
    else:
        # NOTE(review): iterates the multi-geometry directly; confirm this is
        # supported by the installed shapely version.
        for polygon in alpha_shape:
            temp = find_vertices(polygon)
            loops.append(temp)

    # Use the loops to get the set W_alpha.
    # Variable Wa is a vector recording the indices of points in W_alpha.
    # Wa starts with a placeholder 0 that is dropped after sorting.
    Wa = [0]
    for loop in loops:
        temp = loop
        # Drop the final vertex of the ring.
        temp = loop[:-1]
        temp = [i for i in temp if (i < n - 1)]
        max_k = max(temp)
        min_k = min(temp)
        len_k = len(temp)
        as_k = temp
        # Unless the list already runs from its minimum to its maximum index,
        # keep only the stretch from the minimum to the maximum (wrapping
        # around the end of the list when needed).
        if ((as_k[0] == min_k and as_k[len_k - 1] == max_k) == False):
            index_max = as_k.index(max_k)
            index_min = as_k.index(min_k)
            if (index_min < index_max):
                as_k = as_k[index_min:(index_max + 1)]
            else:
                as_k = as_k[index_min:] + as_k[0:(index_max + 1)]
        Wa = Wa + as_k
    Wa.sort()
    Wa = Wa[1:]

    # AS is an n by 2 matrix recording tilde(AS_alpha). Each row is the wavelength and intensity of one pixel.
    AS = order.copy()
    for i in range(n - 1):
        indices = [m for m, v in enumerate(Wa) if v > i]
        if (len(indices) != 0):
            index = indices[0]
            # NOTE: from here on `a` is rebound and no longer holds the
            # function argument (alpha was already computed above).
            a = Wa[index - 1]
            b = Wa[index]
            # Linear interpolation between the two neighbouring W_alpha points.
            AS["intens"][i] = AS["intens"][a] + (
                AS["intens"][b] - AS["intens"][a]) * (
                    (AS["wv"][i] - AS["wv"][a]) / (AS["wv"][b] - AS["wv"][a]))
        else:
            # AS=AS.drop(list(range(i, n)))
            break

    # Run a local polynomial on tilde(AS_alpha), as described in step 3 of the AFS algorithm.
    # A second order local polynomial is fitted with R's loess.
    x = AS["wv"].values
    y = AS["intens"].values
    # convert x and y to R vectors
    x = robjects.FloatVector(list(x))
    y = robjects.FloatVector(list(y))
    df = robjects.DataFrame({"x": x, "y": y})
    # run loess (haven't found a way to specify "control" parameters)
    loess_fit = r.loess("y ~ x", data=df, degree=2, span=d, surface="direct")
    B1 = r.predict(loess_fit, x)
    # Add a new column called select to the matrix order.
    # order["select"] records hat(y^(1)).
    select = order["intens"].values / B1
    order["select"] = select
    # Make indices in Wa to the format of small windows.
    # Each row of the variable window is a pair of neighboring indices in Wa.
    window = np.column_stack((Wa[0:len(Wa) - 1], Wa[1:]))

    # This chunk of code selects the top q quantile of points in each window.
    # The point indices are recorded in variable index, which is S_alpha, q in step 4
    # of the AFS algorithm.
    # index starts with a placeholder 0 that is dropped below.
    index = [0]
    for i in range(window.shape[0]):
        loc_window = window[i, ]
        temp = order.loc[loc_window[0]:loc_window[1]]
        index_i = temp[temp["select"] >= np.quantile(temp["select"], q)].index
        index = index + list(index_i)
    index = np.unique(index[1:])
    index = np.sort(index)

    # Run Loess for the last time
    x_2 = order.iloc[index]["wv"].values
    y_2 = order.iloc[index]["intens"].values
    x_2 = robjects.FloatVector(list(x_2))
    y_2 = robjects.FloatVector(list(y_2))
    df2 = robjects.DataFrame({"x_2": x_2, "y_2": y_2})
    loess_fit2 = r.loess("y_2 ~ x_2",
                         data=df2,
                         degree=2,
                         span=d,
                         surface="direct")
    # Predict at every pixel's wavelength (x), not only the selected ones.
    y_final = r.predict(loess_fit2, x)
    # Return the blaze-removed spectrum.
    result = order["intens"].values / y_final
    return result
# <headingcell level=4>

# Using non-base packages in Rpy2

# <codecell>

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
r = robjects.r
e1071 = importr('e1071')

# Turn the iris labels into an R factor
Yr = r.factor(np2r(iris['Type']))

# Fit the SVM, predict on the same features and tabulate predicted vs. actual
svm = e1071.svm(Xr, Yr)
yhat = r.predict(svm, Xr)
confusion = r.table(yhat, Yr)
print(confusion)

# <headingcell level=4>

# ggplot2 in python with Rpy2

# <markdowncell>

# Thanks to [Fei Yu](http://www.thefeiyu.com/) for this vignette.

# <codecell>

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
def spline_est(data, new_data):
    """Fit ``val`` on a B-spline of ``OverallRank`` and predict for ``new_data``.

    The model is fitted with R's ``lm`` on ``data``; the return value is
    whatever R's ``predict`` yields for ``new_data``.
    """
    fitted = r.lm('val ~ bs(OverallRank, df=4)', data=data)
    estimates = r.predict(fitted, newdata=new_data)
    return estimates
def ALSFS(order, led, a=6, q=0.95, d=0.25):
    """Return the blaze-removed spectrum of one order (ALSFS algorithm).

    Follows the same first steps as AFS (alpha shape, interpolation, loess,
    per-window quantile selection), then fits a three-parameter linear
    transformation of the ``led`` spectrum to the selected points and divides
    the order's intensities by it.

    Side effects: ``order`` is modified in place (columns renamed to
    ``"wv"``/``"intens"``, ``"intens"`` rescaled, ``"select"`` added),
    ``led["intens"]`` is rescaled in place, and the global pandas option
    ``mode.chained_assignment`` is set to None.

    Args:
        order: Two-column pandas DataFrame (wavelength, intensity).
            Presumably indexed 0..n-1 -- TODO confirm.
        led: DataFrame with ``"wv"`` and ``"intens"`` columns; presumably
            aligned row-for-row with ``order`` -- TODO confirm.
        a: The alpha-shape radius is the wavelength range divided by ``a``.
        q: Quantile used to pick the top points inside each window.
        d: ``span`` passed to the loess fit.

    Returns:
        The scaled intensities of ``order`` divided by the fitted estimate
        B2, one value per pixel.
    """
    # Silence pandas' chained-assignment warning (global setting).
    pd.options.mode.chained_assignment = None
    # Default value of q and d are 0.95 and 0.25.
    # Change the column names and format of the dataset.
    order.columns = ["wv", "intens"]
    # n records the number of pixels.
    n = order.shape[0]
    ref = order["wv"]
    # Variable u is the parameter u in the step 1 of AFS algorithm. It scales the intensity vector.
    u = (ref.max() - ref.min()) / 10 / order["intens"].max()
    order["intens"] = order["intens"] * u
    # Let alpha be 1/6 of the wavelength range of the whole order.
    alpha = (order["wv"].max() - order["wv"].min()) / a

    # This chunk of code detects loops in the boundary of the alpha shape.
    # Usually there is only one loop (polygon).
    # Variable loops is a list.
    # The indices of the k-th loop are recorded in the k-th element of variable loops.
    loops = []
    # Variable points is a list that represents all the sample points (lambda_i, y_i)
    points = [(order["wv"][i], order["intens"][i])
              for i in range(order.shape[0])]
    #t1=time()
    alpha_shape = alphashape.alphashape(points, 1 / alpha)
    #t2=time()
    #print('alphashape function takes')
    #print(t2-t1)

    # Input variables:
    # polygon: shapely polygon object
    # Return variable:
    # a list recording the indices of the vertices in the polygon
    # (each vertex is matched to a pixel by exact wavelength equality)
    def find_vertices(polygon):
        coordinates = list(polygon.exterior.coords)
        return [
            ref[ref == coordinates[i][0]].index[0]
            for i in range(len(coordinates))
        ]

    # if alpha_shape is just a polygon, there is only one loop
    # if alpha_shape is a multi-polygon, we iterate over it and find all the loops.
    if (isinstance(alpha_shape, shapely.geometry.polygon.Polygon)):
        temp = find_vertices(alpha_shape)
        loops.append(temp)
    else:
        # NOTE(review): iterates the multi-geometry directly; confirm this is
        # supported by the installed shapely version.
        for polygon in alpha_shape:
            temp = find_vertices(polygon)
            loops.append(temp)

    # Use the loops to get the set W_alpha.
    # Variable Wa is a vector recording the indices of points in W_alpha.
    # Wa starts with a placeholder 0 that is dropped after sorting.
    Wa = [0]
    for loop in loops:
        temp = loop
        # Drop the final vertex of the ring.
        temp = loop[:-1]
        temp = [i for i in temp if (i < n - 1)]
        max_k = max(temp)
        min_k = min(temp)
        len_k = len(temp)
        as_k = temp
        # Unless the list already runs from its minimum to its maximum index,
        # keep only the stretch from the minimum to the maximum (wrapping
        # around the end of the list when needed).
        if ((as_k[0] == min_k and as_k[len_k - 1] == max_k) == False):
            index_max = as_k.index(max_k)
            index_min = as_k.index(min_k)
            if (index_min < index_max):
                as_k = as_k[index_min:(index_max + 1)]
            else:
                as_k = as_k[index_min:] + as_k[0:(index_max + 1)]
        Wa = Wa + as_k
    Wa.sort()
    Wa = Wa[1:]

    # AS is an n by 2 matrix recording tilde(AS_alpha). Each row is the wavelength and intensity of one pixel.
    AS = order.copy()
    for i in range(n - 1):
        indices = [m for m, v in enumerate(Wa) if v > i]
        if (len(indices) != 0):
            index = indices[0]
            # NOTE: from here on `a` is rebound and no longer holds the
            # function argument (alpha was already computed above).
            a = Wa[index - 1]
            b = Wa[index]
            # Linear interpolation between the two neighbouring W_alpha points.
            AS["intens"][i] = AS["intens"][a] + (
                AS["intens"][b] - AS["intens"][a]) * (
                    (AS["wv"][i] - AS["wv"][a]) / (AS["wv"][b] - AS["wv"][a]))
        else:
            # AS=AS.drop(list(range(i, n)))
            break

    # Run a local polynomial on tilde(AS_alpha), as described in step 3 of the AFS algorithm.
    # A second order local polynomial is fitted with R's loess.
    x = AS["wv"].values
    y = AS["intens"].values
    # convert x and y to R vectors
    x = robjects.FloatVector(list(x))
    y = robjects.FloatVector(list(y))
    df = robjects.DataFrame({"x": x, "y": y})
    # run loess (haven't found a way to specify "control" parameters)
    loess_fit = r.loess("y ~ x", data=df, degree=2, span=d, surface="direct")
    #wv_vec= robjects.FloatVector(list(order["wv"]))
    B1 = r.predict(loess_fit, x)
    # Add a new column called select to the matrix order.
    # order["select"] records hat(y^(1)).
    select = order["intens"].values / B1
    order["select"] = select
    # Calculate Q_2q-1 in step 3 of the ALSFS algorithm.
    Q = np.quantile(order["select"], 1 - (1 - q) * 2)
    # Make indices in Wa to the format of small windows.
    # Each row of the variable window is a pair of neighboring indices in Wa.
    window = np.column_stack((Wa[0:len(Wa) - 1], Wa[1:]))

    # This chunk of code selects the top q quantile of points in each window.
    # The point indices are recorded in variable index, which is S_alpha, q in step 4
    # of the AFS algorithm.
    # index starts with a placeholder 0 that is dropped below.
    index = [0]
    for i in range(window.shape[0]):
        loc_window = window[i, ]
        temp = order.loc[loc_window[0]:loc_window[1]]
        # The per-window threshold is never allowed to fall below the global Q.
        temp_q = max(np.quantile(temp["select"], q), Q)
        index_i = temp[temp["select"] >= temp_q].index
        index = index + list(index_i)
    index = np.unique(index[1:])
    index = np.sort(index)

    # The following chunk of code does step 5 of the ALSFS algorithm.
    # The function minimize() is used to optimize the three
    # linear transformation parameters.
    # The final estimate is in variable B2.
    m = len(index)
    # Rescale the led intensities so their maximum equals the order's maximum.
    led["intens"] = led["intens"] / np.max(led["intens"].values) * np.max(
        order["intens"].values)
    Xnew = led.iloc[index]
    Xnew["constants"] = np.ones(m)
    columnsTitles = ["constants", "intens", "wv"]
    Xnew = Xnew.reindex(columns=columnsTitles)
    order_new = order.iloc[index]
    # Initial guess: B2 equals the led intensity (constant 0, slope 1, wv 0).
    beta = np.array([0, 1, 0])
    v1 = order_new["intens"].values
    m1 = Xnew.values

    # Define the function to be optimized:
    # sum of squared deviations of v1 / (m1 . beta) from 1.
    def f(beta):
        return np.sum(
            np.square((np.divide(v1, np.matmul(m1, beta)) - np.ones(m))))

    op_result = minimize(f, beta)
    param = op_result.x
    B2 = param[1] * led["intens"].values + param[2] * led["wv"].values + param[
        0]
    return order["intens"].values / B2
def meanVar(_files, _gff_file, _output):
    """Count reads per gene, fit a mean-variance trend and simulate counts.

    Steps, as performed below:
      1. Register every gene from ``_gff_file`` for which more than one
         distinct exon ``Parent`` value was collected.
      2. Call ``countbam`` once per entry of ``_files`` to fill the per-gene
         counts, and write them to ``<_output>.rawcounts``.
      3. Compute per-gene mean and variance, fit ``var ~ mean`` with R's
         loess on the rows with mean > 0, and predict the variance at every
         mean.
      4. Replace each gene's counts with ``NREPS`` negative-binomial draws
         (or zeros when the derived ``n`` is not positive) and write them to
         ``<_output>.nbcounts``.

    Exits the process with status 1 when ``_files`` has exactly one entry.

    Returns:
        dict: gene name -> list of ``NREPS`` simulated counts.
    """
    NFILE = len(_files)
    if NFILE == 1:
        sys.stderr.write("Need at least two samples for each group.\n")
        sys.exit(1)

    ## Dictionary of gene counts
    _dict_counts = dict()
    _genes = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    idx = 0
    count = 0

    transcript = set()
    cur_line = None
    lines = 0
    for feature in _gff_file:
        lines += 1
        # A gene line (or the last line of the file) flushes the previous gene.
        # NOTE(review): num_lines is not defined in this function; presumably
        # a module-level total line count -- confirm.
        if feature.type in GENE or lines == num_lines:
            if len(transcript) > 1:
                _dict_counts[cur_line.name] = [0] * NFILE
                _genes[cur_line.iv] += cur_line.name
                count += 1
            cur_line = feature
            transcript.clear()
        if feature.type in EXON:
            transcript.add(feature.attr["Parent"])

    print "Number of genes", count
    _file_raw_count = open(_output + '.rawcounts', 'w')
    _file_nb_count = open(_output + '.nbcounts', 'w')

    ## This loop reads through the input list and calls countbam for each input file
    for f in _files:
        bam_file = HTSeq.BAM_Reader(f)
        _dict_counts = countbam(bam_file, _genes, _dict_counts, idx)
        idx += 1
        sys.stderr.write("Library %d has generated.\n" % idx)

    ## Print raw counts in file specified by <out>
    for key, value in sorted(_dict_counts.iteritems()):
        _file_raw_count.write(key + "\t" + "\t".join(map(str, value)) + "\n")
    _file_raw_count.close()

    ## Calculate group mean and variance
    # Lists are built in sorted-key order; the simulation loop below relies
    # on iterating in the same order.
    list_mean = list()
    list_var = list()
    for key, value in sorted(_dict_counts.iteritems()):
        list_mean.append(np.mean(np.array(value)))
        list_var.append(np.var(np.array(value)))

    ## Compute loess estimates
    ## The following code is using rpy2 module
    a = robjects.FloatVector(list_mean)
    b = robjects.FloatVector(list_var)
    df = robjects.DataFrame({"mean": a, "var": b})
    non0_df = df.rx(df.rx2("mean").ro > 0, True)  ## subsetting if mean > 0
    loess_fit = r.loess("var ~ mean", data=non0_df, degree=2)
    var_pred = r.predict(loess_fit, a)

    # This loop overwrites the entries of _dict_counts with the simulated count data
    count_idx = 0
    for key, value in sorted(_dict_counts.iteritems()):
        # n = mean^2 / (predicted variance - mean)
        n = math.pow(list_mean[count_idx],
                     2) / (var_pred[count_idx] - list_mean[count_idx])
        n = int(n)  # n: number of failures
        if n <= 0:
            _dict_counts[key] = [0] * NREPS
        else:
            p = n / float(n + list_mean[count_idx])  # p: prob of success
            _dict_counts[key] = nbinom.rvs(n, p, size=NREPS).tolist()
        count_idx += 1

    for key, value in sorted(_dict_counts.iteritems()):
        _file_nb_count.write(key + "\t" + "\t".join(map(str, value)) + "\n")

    _file_nb_count.close()
    # The raw-count file was already closed above; this second close is redundant.
    _file_raw_count.close()
    return _dict_counts
import json

import pandas as pd
# NOTE(review): pandas.rpy is not shipped with recent pandas releases --
# confirm which pandas version this script targets.
from pandas.rpy.common import convert_to_r_dataframe
import rpy2.robjects as ro
from rpy2.robjects import r

# Load the model that was serialized from R
fitted_model = r.readRDS("models/model1.RDS")

# Example payload (in real life maybe the json comes from the front end)
json_to_predict = '[{"carat":0.23,"cut":"Ideal","color":"E","clarity":"SI2"}]'

# Method 1: parse the JSON in Python, go through pandas, then convert to R
to_predict_dict = json.loads(json_to_predict)
to_predict_pandas_df = pd.DataFrame(to_predict_dict)
to_predict_R_df = convert_to_r_dataframe(to_predict_pandas_df)

# The result must hold exactly one value; unpacking enforces that
pandas_result = r.predict(fitted_model, to_predict_R_df)
(prediction,) = pandas_result
print("Prediction from pandas DF is %f" % prediction)

# Method 2: hand the raw JSON to R and let jsonlite build the data frame
jsonlite = ro.packages.importr("jsonlite")
to_predict_R_df_from_json = jsonlite.fromJSON(json_to_predict)

json_result = r.predict(fitted_model, to_predict_R_df_from_json)
(prediction,) = json_result
print("Prediction from JSON is %f" % prediction)