def fit(self, X, y, tau=0.5, method="pogs", na_action=None):
    """
    Fit a regression of ``y`` on the columns of ``X``.

    With ``method == "pogs"`` (the default) the call is delegated to
    ``self.fit_pogs(X, y, tau)`` and its result is returned unchanged.
    Any other ``method`` value fits a model with R's ``lm`` using the
    formula produced by ``self.formula_generator(X, y)``.

    NOTE(review): the ``lm`` branch uses neither ``tau`` nor ``na_action``,
    so it is a plain linear-model fit, not a quantile regression --
    confirm that this is intended.

    :param X: predictors; must provide ``.columns`` and ``.index``
        (used as a pandas DataFrame)
    :param y: response; must provide ``.to_frame()`` (used as a pandas Series)
    :param tau: forwarded to ``fit_pogs``; unused by the ``lm`` branch
    :param method: ``"pogs"`` selects ``fit_pogs``; anything else selects ``lm``
    :param na_action: currently unused
    :return: the ``fit_pogs`` result, or for the ``lm`` branch a dict
        ``{"Coefficients": {"Intercept": ..., <column>: ...}, "Time (s)": ...}``
        where the time covers the ``r.lm`` call only
    """
    if method == "pogs":
        return self.fit_pogs(X, y, tau)

    formula = self.formula_generator(X, y)
    # ``join_axes`` was removed from ``pd.concat`` in pandas 1.0; reindexing
    # the concatenated frame on X's index is the documented replacement.
    data = pd.concat([X, y.to_frame()], axis=1).reindex(X.index)

    t0 = time.time()
    ret = r.lm(r(formula), data=data)
    t1 = time.time()

    # First element of the fitted model holds the coefficients:
    # position 0 is read as the intercept, then one entry per column of X.
    coefficients = ret[0]
    output = {"Coefficients": {}, "Time (s)": t1 - t0}
    output["Coefficients"]["Intercept"] = coefficients[0]
    for position, column in enumerate(X.columns.values, start=1):
        output["Coefficients"][column] = coefficients[position]
    return output
def through_the_origin(x, y):
    """Fit ``y ~ 0 + x`` with R's ``lm`` and report the key summary figures.

    Both inputs are wrapped in ``FloatVector`` and combined into a data
    frame before the model is fitted and summarised.

    Returns a dict with three keys: ``'coefficient'`` and ``'stderr'`` are
    elements 0 and 1 of the summary's ``coefficients`` entry, and
    ``'r.squared'`` is element 0 of its ``r.squared`` entry.
    """
    frame = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    model_summary = r.summary(r.lm('y ~ 0 + x', frame))

    coefficient_table = model_summary.rx2('coefficients')
    estimate = coefficient_table[0]
    std_error = coefficient_table[1]
    r_squared = model_summary.rx2('r.squared')[0]

    return {
        'coefficient': estimate,
        'stderr': std_error,
        'r.squared': r_squared,
    }
def method_spline(rvar, train, test):
    """
    Fit a B-spline model with a treatment interaction and report on it.

    The response ``rvar`` is regressed on a 6-df B-spline of ``OverallRank``,
    ``treat`` and their interaction, without an intercept; a ``year`` term is
    appended when ``rvar`` is ``'Tuition'``. The model's R-squared is printed,
    then ``analytics`` is called for the training fit and -- unless ``rvar``
    is ``"UndergraduatemedianGPA"`` -- for predictions on ``test``.
    """
    print("Splines")

    formula_parts = [
        ' ~ bs(OverallRank, df=6) + treat + ',
        'treat:bs(OverallRank, df=6) - 1',
    ]
    if rvar == 'Tuition':
        formula_parts.append(' + year')
    formula = rvar + ''.join(formula_parts)

    model = r.lm(formula, data=train)
    print(r.summary(model).rx2('r.squared'))

    # In-sample predictions: ``predict`` without ``newdata``.
    analytics(rvar, 'Training', train[rvar], np.array(r.predict(model)))

    skip_testing = rvar == "UndergraduatemedianGPA"
    if not skip_testing:
        held_out = np.array(r.predict(model, newdata=test))
        analytics(rvar, 'Testing', test[rvar], held_out)
    print()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Every positional argument is read as a table. For each pair of tables
    a linear model ``y ~ x`` is fitted in R, where ``y`` holds the first
    column of both tables and ``x`` is a two-level factor marking which
    table a value came from. One tab-separated row with the ANOVA fields is
    written to ``options.stdout`` per pair, after a header row.
    """
    if argv is None:
        # NOTE(review): argv is not passed on to E.Start below -- confirm
        # whether it should be.
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: data2multiple_anova.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t", "--filename-tree", dest="filename_tree", type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header", dest="add_header", action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--write-header", dest="write_header", action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug", dest="debug", action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree", dest="display_tree", action="store_true",
                      help="display the tree")
    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        # Must be a real list: it is reused as ``take=`` for every input
        # file, and a lazy ``map`` object would be exhausted after the first.
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    data = []
    options.filenames = args
    for filename in options.filenames:
        # ``with`` closes the file even if reading the table fails.
        with open(filename, "r") as infile:
            table, headers = IOTools.readTable(
                infile, take=options.columns, headers=False)
        data.append(table)

    fields = ["Df", "Sum Sq", "F value", "Pr(>F)", "Mean Sq"]

    options.stdout.write("\t".join(["set1", "set2"] + fields) + "\n")

    # CODE needs to be refactored for rpy2 usage

    for x in range(len(data)):
        for y in range(x + 1, len(data)):
            # The model is fitted with conversion switched off and handed
            # straight back to R for the ANOVA.
            rpy.set_default_mode(rpy.NO_CONVERSION)

            factors = ["x"] * len(data[x][:, 0]) + ["y"] * len(data[y][:, 0])
            values = list(data[x][:, 0]) + list(data[y][:, 0])

            linear_model = R.lm(
                R("y ~ x"), data=R.data_frame(x=factors, y=values))

            # Conversion is switched back on so the ANOVA result can be
            # indexed by field name below.
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            result = R.anova(linear_model)

            row = [options.filenames[x], options.filenames[y]]
            row.extend(str(result[field]) for field in fields)
            options.stdout.write("\t".join(row) + "\n")
from numpy import array, rec
from numpy.random import normal as nprandom
from rpy2.robjects import numpy2ri, r

# The NumPy converter has to be switched on explicitly before NumPy
# objects can be passed to R functions; importing the module is not enough.
numpy2ri.activate()

# Build a small noisy linear data set as a NumPy record array with the
# fields ``foo`` (0..9) and ``bar`` (foo plus standard-normal noise).
foo = array(range(10))
bar = foo + nprandom(0, 1, 10)
d = rec.fromarrays([foo, bar], names=('foo', 'bar'))
print(d)

# Regress bar on foo in R and show the fitted coefficients.
fit = r.lm('bar ~ foo', data=d)
print(fit.rx2('coefficients'))
def __init__(self, formula, **kwargs):
    """
    Fit a linear regression in R and keep the results on the instance.

    Every keyword argument is converted to a NumPy array, stored in R's
    global environment under the keyword's name, and also set as an
    attribute of the same name on this object. For instance the kwarg
    `x=[1,2,3,4]` creates an R variable `x`, which the formula can then
    refer to.

    `formula` is a string handed unchanged to R's `lm()` function.

    Example usage::

        >>> x = [1, 2, 3, 4]
        >>> y = [1.2, 3, 7, 10]
        >>> m = LinearRegression(x=x, y=y, formula='y~x')
        >>> m.slope
        3.0399999999999996

        >>> m.intercept
        -2.299999999999998

        >>> m.adj_r_squared
        0.97221750212404412

        >>> m.slope_pval(0)
        0.0093041159117684229

        >>> m.intercept_pval(0)
        0.10459053583417365

        >>> # Variables accessible as NumPy arrays
        >>> m.x
        array([1, 2, 3, 4])

    Cross-check with scipy.stats.linregress::

        >>> from scipy.stats import linregress as scipy_linregress
        >>> results = scipy_linregress(x, y)
        >>> eps = 1e-15
        >>> assert abs(results[0] - m.slope) < eps
        >>> eps = 1e-10
        >>> assert abs(results[1] - m.intercept) < eps
        >>> eps = 1e-15
        >>> assert abs(results[2] ** 2 - m.r_squared) < eps
        >>> eps = 1e-15
        >>> assert abs(results[3] - m.slope_pval(0)) < eps

    TODO:

        - support for more complex models (requires examining the coeffs
          matrix to see what's included)
    """
    for name, raw_values in kwargs.items():
        as_array = np.array(raw_values)
        robjects.globalenv[name] = as_array
        setattr(self, name, as_array)

    # No ``data`` argument is given: the formula's variables are the ones
    # just placed in R's global environment.
    self.lm = r.lm(formula)
    self.summary = r.summary(self.lm)

    # Flat positions 6 and 7 of the coefficient table are kept as the
    # intercept and slope p-values.
    # NOTE(review): presumably this relies on a 2x4 table for a
    # one-predictor model -- confirm.
    coefficient_table = self.summary.rx2('coefficients')
    intercept_p, slope_p = coefficient_table[6], coefficient_table[7]
    self._intercept_p = intercept_p
    self._slope_p = slope_p
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads one table per positional argument, then for every pair of tables
    fits ``y ~ x`` in R on the pooled first columns (``x`` marks the source
    table) and writes the ANOVA fields for that pair as a tab-separated
    row to ``options.stdout``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: data2multiple_anova.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-c", "--columns", dest="columns", type="string",
        help="columns to take for calculating histograms.")
    parser.add_option(
        "-t", "--tree-nh-file", dest="filename_tree", type="string",
        help="filename with tree(s).")
    parser.add_option(
        "--skip-header", dest="add_header", action="store_false",
        help="do not add header to flat format.")
    parser.add_option(
        "--output-with-header", dest="write_header", action="store_true",
        help="write header and exit.")
    parser.add_option(
        "--debug", dest="debug", action="store_true",
        help="debug mode")
    parser.add_option(
        "--display-tree", dest="display_tree", action="store_true",
        help="display the tree")
    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("contrasts", "spearman", "pearson", "compute"),
        help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        # Column numbers on the command line are 1-based; shift to 0-based.
        zero_based = []
        for token in options.columns.split(","):
            zero_based.append(int(token) - 1)
        options.columns = zero_based

    options.filenames = args
    data = []
    for path in options.filenames:
        handle = IOTools.openFile(path, "r")
        table, _headers = IOTools.readTable(
            handle, take=options.columns, headers=False)
        handle.close()
        data.append(table)

    fields = ["Df", "Sum Sq", "F value", "Pr(>F)", "Mean Sq"]
    out = options.stdout

    out.write("set1\tset2")
    for column_name in fields:
        out.write("\t" + column_name)
    out.write("\n")

    # CODE needs to be refactored for rpy2 usage

    table_count = len(data)
    for first in range(table_count):
        for second in range(first + 1, table_count):
            rpy.set_default_mode(rpy.NO_CONVERSION)

            first_column = data[first][:, 0]
            second_column = data[second][:, 0]
            factors = ["x"] * len(first_column) + ["y"] * len(second_column)
            values = list(first_column) + list(second_column)

            frame = R.data_frame(x=factors, y=values)
            linear_model = R.lm(R("y ~ x"), data=frame)

            # Conversion is re-enabled before the ANOVA, whose result is
            # indexed by field name below.
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            result = R.anova(linear_model)

            out.write("%s\t%s" % (options.filenames[first],
                                  options.filenames[second]))
            for column_name in fields:
                out.write("\t" + str(result[column_name]))
            out.write("\n")
# Pairwise ANOVA report: one header row, then one row per pair of tables
# in ``data``. ``options`` and ``data`` come from the surrounding code.
fields = ["Df", "Sum Sq", "F value", "Pr(>F)", "Mean Sq"]

options.stdout.write("set1\tset2")
for field in fields:
    options.stdout.write("\t" + field)
options.stdout.write("\n")

# CODE needs to be refactored for rpy2 usage

for x in range(len(data)):
    for y in range(x + 1, len(data)):
        rpy.set_default_mode(rpy.NO_CONVERSION)

        # Pool the first column of both tables; the factor records which
        # table each value came from.
        factors = ["x"] * len(data[x][:, 0]) + ["y"] * len(data[y][:, 0])
        values = list(data[x][:, 0]) + list(data[y][:, 0])

        linear_model = R.lm(
            R("y ~ x"),
            data=R.data_frame(x=factors, y=values),
        )

        # Conversion is switched back on; the ANOVA result is indexed by
        # field name below.
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        result = R.anova(linear_model)

        options.stdout.write(
            "%s\t%s" % (options.filenames[x], options.filenames[y]))
        for field in fields:
            options.stdout.write("\t" + str(result[field]))
        options.stdout.write("\n")
def spline_est(data, new_data):
    """
    Predict ``val`` for ``new_data`` from a B-spline fit on ``data``.

    Fits ``val ~ bs(OverallRank, df=4)`` with R's ``lm`` on ``data`` and
    returns the result of R's ``predict`` for ``new_data``.
    """
    fitted_model = r.lm('val ~ bs(OverallRank, df=4)', data=data)
    predictions = r.predict(fitted_model, newdata=new_data)
    return predictions
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects import r as R
import pandas as pd

# Switch on the pandas <-> R conversion before any DataFrame is passed to R.
pandas2ri.activate()

# Small test DataFrame with three integer columns.
data = {
    'a': list(range(1, 10)),
    'b': list(range(11, 20)),
    'c': [21, 22, 23, 24, 25, 26, 26, 28, 29],
}
test = pd.DataFrame(data)
print(test.head())

# Regress a on b in R and print the coefficient table from the summary.
M = R.lm('a ~ b', data=test)
coefficient_table = R.summary(M).rx2('coefficients')
print(coefficient_table)