Exemplo n.º 1
0
    def fit(self, X, y, tau = 0.5, method = "pogs", na_action = None):
        """
        Fit a quantile regression

        :param X:
        :param y:
        :param tau:
        :param method:
        :param na_action:
        :return:
        """
        if method == "pogs" :
            return self.fit_pogs(X, y, tau)

        formula = self.formula_generator(X, y)
        data = pd.concat([X, y.to_frame()], axis=1, join_axes=[X.index])

        t0 = time.time()
        ret = r.lm(r(formula), data = data)
        t1 = time.time()

        output = {"Coefficients":{}, "Time (s)":t1-t0}
        output["Coefficients"]["Intercept"] = ret[0][0]

        for i in range(0, len(list(X.columns.values))):
            column = list(X.columns.values)[i]
            output["Coefficients"][column] = ret[0][i+1]

        return output
Exemplo n.º 2
0
def through_the_origin(x, y):
    df = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    s = r.summary(r.lm('y ~ 0 + x', df))
    return {
        'coefficient': s.rx2('coefficients')[0],
        'stderr': s.rx2('coefficients')[1],
        'r.squared': s.rx2('r.squared')[0]
    }
Exemplo n.º 3
0
def method_spline(rvar, train, test):
    """ B-splines with interaction """
    print("Splines")
    formula = rvar + ' ~ bs(OverallRank, df=6) + treat + '\
              'treat:bs(OverallRank, df=6) - 1'
    if rvar == 'Tuition':
        formula = formula + ' + year'
    model = r.lm(formula, data=train)
    #print(r.summary(model).rx2('coefficients'))
    print(r.summary(model).rx2('r.squared'))
    #print(r.summary(model))
    analytics(rvar, 'Training', train[rvar],
              np.array(r.predict(model)))
    if rvar != "UndergraduatemedianGPA":
        analytics(rvar, 'Testing', test[rvar],
                  np.array(r.predict(model, newdata=test)))
    print()
Exemplo n.º 4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: data2multiple_anova.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t", "--filename-tree", dest="filename_tree", type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header", dest="add_header", action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--write-header", dest="write_header", action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug", dest="debug", action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree", dest="display_tree", action="store_true",
                      help="display the tree")

    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        options.columns = map(lambda x: int(x) - 1, options.columns.split(","))

    data = []

    options.filenames = args

    for filename in options.filenames:

        infile = open(filename, "r")
        table, headers = IOTools.readTable(
            infile, take=options.columns, headers=False)
        infile.close()

        data.append(table)

    fields = ["Df", "Sum Sq", "F value", "Pr(>F)", "Mean Sq"]

    options.stdout.write("set1\tset2")
    for field in fields:
        options.stdout.write("\t%s" % field)
    options.stdout.write("\n")

    # CODE needs to be refactored for rpy2 usage

    for x in range(len(data)):

        for y in range(x + 1, len(data)):

            rpy.set_default_mode(rpy.NO_CONVERSION)

            factors = ["x"] * len(data[x][:, 0]) + ["y"] * len(data[y][:, 0])
            values = list(data[x][:, 0]) + list(data[y][:, 0])

            linear_model = R.lm(
                R("y ~ x"), data=R.data_frame(x=factors, y=values))
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            result = R.anova(linear_model)

            options.stdout.write(
                "%s\t%s" % (options.filenames[x], options.filenames[y]))
            for field in fields:
                options.stdout.write("\t%s" % str(result[field]))
            options.stdout.write("\n")
Exemplo n.º 5
0
from numpy import array, rec
from numpy.random import normal as nprandom
from rpy2.robjects import numpy2ri, r

foo = array(range(10))
bar = foo + nprandom(0,1,10)

d = rec.fromarrays([foo, bar], names=('foo','bar'))
print d
fit = r.lm('bar ~ foo', data=d)
print fit.rx2('coefficients')
Exemplo n.º 6
0
    def __init__(self, formula, **kwargs):
        """
        Class for managing linear regression in R.

        Data are specified with the keyword arguments, which are passed to R's
        global environment.  They are first converted to NumPy arrays.

        For example, the kwarg `x=[1,2,3,4]` will add the list of four
        numbers to R's global env with the variable name `x`.  You can then
        access `x` from the formula.

        `formula` is a string passed verbatim to R's `lm()` function.

        Example usage::

            >>> x = [1, 2, 3, 4]
            >>> y = [1.2, 3, 7, 10]
            >>> m = LinearRegression(x=x, y=y, formula='y~x')
            >>> m.slope
            3.0399999999999996

            >>> m.intercept
            -2.299999999999998

            >>> m.adj_r_squared
            0.97221750212404412

            >>> m.slope_pval(0)
            0.0093041159117684229

            >>> m.intercept_pval(0)
            0.10459053583417365

            >>> # Variables accessible as NumPy arrays
            >>> m.x
            array([1, 2, 3, 4])

        Cross-check with scipy.stats.linregress::

            >>> from scipy.stats import linregress as scipy_linregress
            >>> results = scipy_linregress(x, y)
            >>> eps = 1e-15
            >>> assert abs(results[0] - m.slope) < eps
            >>> eps = 1e-10
            >>> assert abs(results[1] - m.intercept) < eps
            >>> eps = 1e-15
            >>> assert abs(results[2] ** 2 - m.r_squared) < eps
            >>> eps = 1e-15
            >>> assert abs(results[3] - m.slope_pval(0)) < eps


        TODO:
            - support for more complex models (requires examining the coeffs
              matrix to see what's included)

        """

        for k, v in kwargs.items():
            v = np.array(v)
            robjects.globalenv[k] = v
            setattr(self, k, v)

        self.lm = r.lm(formula)
        self.summary = r.summary(self.lm)
        coeffs = self.summary.rx2('coefficients')
        self._intercept_p, self._slope_p = coeffs[6], coeffs[7]
Exemplo n.º 7
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: data2multiple_anova.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t",
                      "--tree-nh-file",
                      dest="filename_tree",
                      type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header",
                      dest="add_header",
                      action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--output-with-header",
                      dest="write_header",
                      action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug",
                      dest="debug",
                      action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree",
                      dest="display_tree",
                      action="store_true",
                      help="display the tree")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    data = []

    options.filenames = args

    for filename in options.filenames:

        infile = IOTools.openFile(filename, "r")
        table, headers = IOTools.readTable(infile,
                                           take=options.columns,
                                           headers=False)
        infile.close()

        data.append(table)

    fields = ["Df", "Sum Sq", "F value", "Pr(>F)", "Mean Sq"]

    options.stdout.write("set1\tset2")
    for field in fields:
        options.stdout.write("\t%s" % field)
    options.stdout.write("\n")

    # CODE needs to be refactored for rpy2 usage

    for x in range(len(data)):

        for y in range(x + 1, len(data)):

            rpy.set_default_mode(rpy.NO_CONVERSION)

            factors = ["x"] * len(data[x][:, 0]) + ["y"] * len(data[y][:, 0])
            values = list(data[x][:, 0]) + list(data[y][:, 0])

            linear_model = R.lm(R("y ~ x"),
                                data=R.data_frame(x=factors, y=values))
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            result = R.anova(linear_model)

            options.stdout.write("%s\t%s" %
                                 (options.filenames[x], options.filenames[y]))
            for field in fields:
                options.stdout.write("\t%s" % str(result[field]))
            options.stdout.write("\n")
Exemplo n.º 8
0
    fields = [ "Df", "Sum Sq", "F value", "Pr(>F)", "Mean Sq"]
    
    options.stdout.write("set1\tset2" )
    for field in fields:
        options.stdout.write("\t%s" % field )
    options.stdout.write("\n" )

    # CODE needs to be refactored for rpy2 usage

    for x in range( len(data )):

        for y in range(x+1,len(data)):

            rpy.set_default_mode(rpy.NO_CONVERSION)
            
            factors = ["x"] * len(data[x][:,0]) + ["y"] * len(data[y][:,0])
            values = list(data[x][:,0]) + list(data[y][:,0])

            linear_model = R.lm(R("y ~ x"), data = R.data_frame(x=factors, y=values ))
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            result = R.anova( linear_model )
            
            options.stdout.write( "%s\t%s" % (options.filenames[x], options.filenames[y]) )
            for field in fields:
                options.stdout.write("\t%s" % str( result[field] ) )
            options.stdout.write("\n" )




Exemplo n.º 9
0
 def spline_est(data, new_data):
     """ Estimate conditional b-splines for value function """
     model = r.lm('val ~ bs(OverallRank, df=4)', data=data)
     return r.predict(model, newdata=new_data)
Exemplo n.º 10
0
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects import r as R
import pandas as pd

# Activating R environment
pandas2ri.activate()
#R = ro.r



# Creating a test DataFrame
data = {'a' : [1, 2, 3, 4, 5, 6, 7, 8, 9],
        'b' : [11, 12, 13, 14, 15, 16, 17, 18, 19],
        'c' : [21, 22, 23, 24, 25, 26, 26, 28, 29]        
}
 
test = pd.DataFrame(data)

print(test.head())

M = R.lm('a ~ b', data=test)


print(R.summary(M).rx2('coefficients'))