예제 #1
0
def Glm(x, y, famil='gaussian'):
    """
    Fit the generalized linear model ``Y ~ X`` through R and report its
    coefficients.

    :param x: X-axis values
    :param y: Y-axis values
    :param famil: value handed to R's ``glm`` as its ``family`` argument

    :type x: float list
    :type y: float list
    :type famil: string

    :returns: a dictionary with the keys ``'Intercept'``, ``'Slope'`` and
              ``'Family'``, read from the fitted model's ``coefficients``
              and ``family`` entries
    :rtype: dict

    :note: the 2 vector/list must have the same size
    """
    # The formula refers to the column names given to the data frame below.
    formula = rpy.r("Y~X")
    frame = rpy.r.data_frame(X=x, Y=y)
    fit = rpy.r.glm(formula, data=frame, family=famil)

    coefficients = fit['coefficients']
    return {
        'Intercept': coefficients['(Intercept)'],
        'Slope': coefficients['X'],
        'Family': fit['family']['family'],
    }
예제 #2
0
def multiReg(x, y, colList, alpha):
    """
    Fit a multiple linear regression of ``y`` on selected entries of ``x``
    using R's ``lm``.

    For every index ``i`` in ``colList`` the regressor ``x[i]`` is registered
    under the name ``X<i>``; the fitted formula is ``Y~X<i>+...+1``.

    :param x: indexable collection of regressors; ``x[i]`` is looked up for
              each ``i`` in ``colList``
    :param y: response values
    :param colList: indices into ``x`` selecting the regressors to use
    :param alpha: risk level expressed as a percentage; the normal quantile
                  is taken at ``1 - alpha/200``

    :returns: dictionary with the keys ``'regressor'`` (coefficients after
              the first), ``'intercept'`` (first coefficient), ``'r2'``,
              ``'adj_r2'``, ``'ic'`` (``sigma * quantile / sqrt(len(y))``),
              ``'x'`` (the selected regressors) and ``'y'``
    :rtype: dict

    :note: the data dictionary and the R summary are printed to stdout, as
           in the original implementation.
    """
    newX = []
    d = {'Y': y}
    terms = []
    for i in colList:
        name = "X" + str(i)
        d[name] = x[i]
        newX.append(x[i])
        terms.append(name)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(d)

    # Join the terms once instead of growing the string inside the loop;
    # the trailing "1" keeps the explicit intercept term of the formula.
    model = rpy.r("Y~" + "+".join(terms + ["1"]))

    # NOTE(review): relies on the rpy module in use exposing ``sqrt`` - confirm.
    n = rpy.sqrt(len(y))
    norm = rpy.r.qnorm(1. - (alpha / 200.))
    # Call lm without converting its result so it can be handed back to R.
    Rlm = rpy.with_mode(rpy.NO_CONVERSION, rpy.r.lm)
    reg = Rlm(model, data=d)
    result = rpy.r.summary(reg)
    print(result)

    # First column of the coefficient table; row 0 is read as the intercept.
    regressor = result['coefficients'][:, 0]
    ic = result['sigma'] * norm / n

    return {
        'regressor': regressor[1:],
        'intercept': regressor[0],
        'r2': result['r.squared'],
        'adj_r2': result['adj.r.squared'],
        'ic': ic,
        'x': newX,
        'y': y,
    }
예제 #3
0
def regLinOri(x, y, alpha=5):
    """
    Run the pass-through-origin linear regression of the 2 given vectors.

    The R formula ``Y~-1+X`` (no intercept term) is built and handed to
    ``regression`` together with the data and ``alpha``.

    :Parameters:
     - `x`: X-axis values
     - `y`: Y-axis values
     - `alpha`: forwarded unchanged to ``regression`` (default 5)

    :Types:
     - `x`: float list
     - `y`: float list

    :returns: whatever ``regression`` returns for this model

    :attention: the 2 vector/list must have the same size
    """
    origin_formula = rpy.r("Y~-1+X")
    return regression(x, y, origin_formula, alpha)
예제 #4
0
def regLin(x, y, alpha=5):
    """
    Run the linear regression (with intercept) of the 2 given vectors.

    The R formula ``Y~X`` is built and handed to ``regression`` together
    with the data and ``alpha``.

    :Parameters:
     - `x`: X-axis values
     - `y`: Y-axis values
     - `alpha`: forwarded unchanged to ``regression`` (default 5)

    :Types:
     - `x`: float list
     - `y`: float list

    :returns: whatever ``regression`` returns for this model

    :attention: the 2 vector/list must have the same size
    """
    formula = rpy.r("Y~X")
    return regression(x, y, formula, alpha)
예제 #5
0
def regLinOri(x, y, alpha=5):
    """
    Regress Y on X with the fit forced through the origin.

    Delegates to ``regression`` using the intercept-free R formula
    ``Y~-1+X``.

    :Parameters:
     - `x`: X-axis values
     - `y`: Y-axis values
     - `alpha`: passed straight through to ``regression`` (default 5)

    :Types:
     - `x`: float list
     - `y`: float list

    :returns: the value produced by ``regression`` for this formula

    :attention: the 2 vector/list must have the same size
    """
    no_intercept_model = rpy.r("Y~-1+X")
    result = regression(x, y, no_intercept_model, alpha)
    return result
예제 #6
0
def regLin(x, y, alpha=5):
    """
    Regress Y on X with an intercept term.

    Delegates to ``regression`` using the R formula ``Y~X``.

    :Parameters:
     - `x`: X-axis values
     - `y`: Y-axis values
     - `alpha`: passed straight through to ``regression`` (default 5)

    :Types:
     - `x`: float list
     - `y`: float list

    :returns: the value produced by ``regression`` for this formula

    :attention: the 2 vector/list must have the same size
    """
    linear_model = rpy.r("Y~X")
    result = regression(x, y, linear_model, alpha)
    return result
예제 #7
0
def multiReg(x, y, colList, alpha):
    """
    Multiple linear regression of ``y`` on the regressors ``x[i]`` selected
    by ``colList``, computed with R's ``lm``.

    Each selected regressor is named ``X<i>`` and the model formula is
    ``Y~X<i>+...+1``.

    :param x: indexable collection of regressors, indexed by the entries of
              ``colList``
    :param y: response values
    :param colList: indices of the regressors to include
    :param alpha: risk level as a percentage (the normal quantile used is
                  ``qnorm(1 - alpha/200)``)

    :returns: dictionary with the keys ``'regressor'``, ``'intercept'``,
              ``'r2'``, ``'adj_r2'``, ``'ic'``, ``'x'`` and ``'y'``
    :rtype: dict

    :note: the data dictionary and the R summary are still printed to
           stdout; only the print syntax changed so the function also
           parses under Python 3.
    """
    newX = []
    d = {'Y': y}
    names = []
    for i in colList:
        name = "X" + str(i)
        names.append(name)
        d[name] = x[i]
        newX.append(x[i])
    print(d)

    # Build the formula in one join; "1" is the explicit intercept term.
    model_string = "Y~" + "+".join(names + ["1"])
    model = rpy.r(model_string)

    # NOTE(review): relies on the rpy module in use exposing ``sqrt`` - confirm.
    n = rpy.sqrt(len(y))
    norm = rpy.r.qnorm(1. - (alpha / 200.))
    # lm is invoked in NO_CONVERSION mode so its result stays an R object
    # that can be passed to summary().
    Rlm = rpy.with_mode(rpy.NO_CONVERSION, rpy.r.lm)
    reg = Rlm(model, data=d)
    result = rpy.r.summary(reg)
    print(result)

    coef = result['coefficients']
    # First column of the coefficient table; row 0 is read as the intercept.
    regressor = coef[:, 0]

    data = {
        'regressor': regressor[1:],
        'intercept': regressor[0],
        'r2': result['r.squared'],
        'adj_r2': result['adj.r.squared'],
        'ic': result['sigma'] * norm / n,
        'x': newX,
        'y': y,
    }
    return data
예제 #8
0
def Glm(x, y, famil='gaussian'):
    """
    Run R's ``glm`` on the model ``Y ~ X`` and collect the fitted
    intercept, slope and family name.

    :param x: X-axis values
    :param y: Y-axis values
    :param famil: passed to ``glm`` as its ``family`` argument

    :type x: float list
    :type y: float list
    :type famil: string

    :returns: ``{'Intercept': ..., 'Slope': ..., 'Family': ...}`` taken from
              the ``coefficients`` and ``family`` entries of the fit
    :rtype: dict

    :note: the 2 vector/list must have the same size
    """
    r_formula = rpy.r("Y~X")
    r_frame = rpy.r.data_frame(X=x, Y=y)
    fitted = rpy.r.glm(r_formula, data=r_frame, family=famil)

    coeffs = fitted['coefficients']
    summary = {}
    summary['Intercept'] = coeffs['(Intercept)']
    summary['Slope'] = coeffs['X']
    summary['Family'] = fitted['family']['family']
    return summary
def main():
    """
    Command-line entry point: summarise a column expression of a
    tab-separated file with R.

    Usage: ``gsummary.py input_file output_file expression``

    Steps:

    1. Validate ``expression``: every alphabetic word must be in
       ``S3_METHODS()['Math']`` and every run of other characters must be in
       ``S3_METHODS()['Ops']``.
    2. Copy the columns referenced as ``c<N>`` (1-based) into a temporary
       tab-separated file, skipping blank lines, ``#`` comment lines and
       lines whose referenced fields are missing or not numeric.
    3. Load that file with R's ``read.table``, evaluate ``expression`` in R
       and write sum, mean, standard deviation and quantiles to the output
       file, preceded by a ``#``-prefixed heading line.

    Errors are reported through ``stop_err`` (defined elsewhere; it is
    expected to terminate the process).  A note about skipped lines is
    printed to stdout.
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    math_allowed = S3_METHODS()['Math']
    ops_allowed = S3_METHODS()['Ops']

    # Check for invalid expressions
    for word in re.findall('[a-zA-Z]+', expression):
        if word not in math_allowed:
            stop_err(
                "Invalid expression '%s': term '%s' is not recognized or allowed"
                % (expression, word))
    symbols = set()
    # Raw string: same pattern as before, without the invalid "\s" escape.
    for symbol in re.findall(r'[^a-z0-9\s]+', expression):
        if symbol not in ops_allowed:
            stop_err(
                "Invalid expression '%s': operator '%s' is not recognized or allowed"
                % (expression, symbol))
        else:
            symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err(
            "Invalid columns '%s': this tool requires a single column or expression"
            % expression)

    # Find all column references in the expression (c1 -> index 0, ...)
    column_refs = re.findall('c[0-9]+', expression)
    cols = [int(ref[1:]) - 1 for ref in column_refs]

    # Text mode, because str values are written; binary mode fails on Python 3.
    tmp_file = tempfile.NamedTemporaryFile('w+')
    # Write the R header row to the temporary file
    hdr_str = "\t".join("c%s" % str(col + 1) for col in cols)
    tmp_file.write("%s\n" % hdr_str)
    skipped_lines = 0
    first_invalid_line = 0
    valid_lines = 0
    with open(datafile) as data_in:
        for line_number, line in enumerate(data_in, 1):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            fields = line.split('\t')
            try:
                # Every referenced field must exist and parse as a number.
                for col in cols:
                    float(fields[col])
            except (ValueError, IndexError):
                skipped_lines += 1
                if not first_invalid_line:
                    first_invalid_line = line_number
                continue
            # Write the R data row to the temporary file
            data_str = "\t".join(fields[col] for col in cols)
            tmp_file.write("%s\n" % data_str)
            valid_lines += 1
    tmp_file.flush()

    # Fail when lines were rejected and none was usable.  The previous test
    # (skipped == total line count) missed files that also contain comment
    # or blank lines.
    if skipped_lines and not valid_lines:
        stop_err(
            "Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements."
        )
    else:
        # summary function and return labels
        set_default_mode(NO_CONVERSION)
        summary_func = r(
            "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }"
        )
        headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']
        headings_str = "\t".join(headings)

        r_data_frame = r.read_table(tmp_file.name, header=True, sep="\t")

        outfile = open(outfile_name, 'w')
        try:
            # Expose each referenced column to R under its c<N> name.
            for col in column_refs:
                r.assign(col, r["$"](r_data_frame, col))
            try:
                # NOTE: the user-supplied expression is evaluated by R; the
                # whitelist checks above are the only guard against
                # arbitrary code.
                summary = summary_func(r(expression))
            except RException as s:
                outfile.close()
                stop_err("Computation resulted in the following error: %s" %
                         str(s))
            summary = summary.as_py(BASIC_CONVERSION)
            outfile.write("#%s\n" % headings_str)
            if isinstance(summary, dict):
                # using rpy
                outfile.write("%s\n" %
                              "\t".join(["%g" % summary[k] for k in headings]))
            else:
                # using rpy2
                outfile.write("%s\n" % "\t".join(["%g" % k for k in summary]))
        finally:
            # Closing twice is harmless; this covers unexpected errors too.
            outfile.close()

        if skipped_lines:
            print("Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % (
                skipped_lines, first_invalid_line))
예제 #10
0
def main():
    """
    Command-line entry point: summarise a column expression of a
    tab-separated file with R.

    Usage: ``gsummary.py input_file output_file expression``

    The expression is validated against ``S3_METHODS()['Math']`` (alphabetic
    words) and ``S3_METHODS()['Ops']`` (other symbols).  Columns referenced
    as ``c<N>`` (1-based) are copied into a temporary tab-separated file;
    blank lines, ``#`` comment lines and lines whose referenced fields are
    missing or not numeric are left out.  R then evaluates the expression
    and the sum, mean, standard deviation and quantiles are written to the
    output file under a ``#``-prefixed heading line.

    Errors are reported through ``stop_err`` (defined elsewhere; it is
    expected to terminate the process).  A note about skipped lines is
    printed to stdout.
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    math_allowed = S3_METHODS()['Math']
    ops_allowed = S3_METHODS()['Ops']

    # Check for invalid expressions
    for word in re.findall('[a-zA-Z]+', expression):
        if word not in math_allowed:
            stop_err("Invalid expression '%s': term '%s' is not recognized or allowed" % (expression, word))
    symbols = set()
    # Raw string: same pattern as before, without the invalid "\s" escape.
    for symbol in re.findall(r'[^a-z0-9\s]+', expression):
        if symbol not in ops_allowed:
            stop_err("Invalid expression '%s': operator '%s' is not recognized or allowed" % (expression, symbol))
        else:
            symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err("Invalid columns '%s': this tool requires a single column or expression" % expression)

    # Find all column references in the expression (c1 -> index 0, ...)
    column_refs = re.findall('c[0-9]+', expression)
    cols = [int(ref[1:]) - 1 for ref in column_refs]

    tmp_file = tempfile.NamedTemporaryFile('w+')
    # Write the R header row to the temporary file
    hdr_str = "\t".join("c%s" % str(col + 1) for col in cols)
    tmp_file.write("%s\n" % hdr_str)
    skipped_lines = 0
    first_invalid_line = 0
    valid_lines = 0
    with open(datafile) as data_in:
        for line_number, line in enumerate(data_in, 1):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            fields = line.split('\t')
            try:
                # Every referenced field must exist and parse as a number.
                for col in cols:
                    float(fields[col])
            except (ValueError, IndexError):
                skipped_lines += 1
                if not first_invalid_line:
                    first_invalid_line = line_number
                continue
            # Write the R data row to the temporary file
            tmp_file.write("%s\n" % "\t".join(fields[col] for col in cols))
            valid_lines += 1
    tmp_file.flush()

    # Fail when lines were rejected and none was usable.  The previous test
    # (skipped == total line count) missed files that also contain comment
    # or blank lines.
    if skipped_lines and not valid_lines:
        stop_err("Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements.")
    else:
        # summary function and return labels
        set_default_mode(NO_CONVERSION)
        summary_func = r("function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }")
        headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']
        headings_str = "\t".join(headings)

        r_data_frame = r.read_table(tmp_file.name, header=True, sep="\t")

        outfile = open(outfile_name, 'w')
        try:
            # Expose each referenced column to R under its c<N> name.
            for col in column_refs:
                r.assign(col, r["$"](r_data_frame, col))
            try:
                # NOTE: the user-supplied expression is evaluated by R; the
                # whitelist checks above are the only guard against
                # arbitrary code.
                summary = summary_func(r(expression))
            except RException as s:
                outfile.close()
                stop_err("Computation resulted in the following error: %s" % str(s))
            summary = summary.as_py(BASIC_CONVERSION)
            outfile.write("#%s\n" % headings_str)
            if isinstance(summary, dict):
                # using rpy
                outfile.write("%s\n" % "\t".join(["%g" % summary[k] for k in headings]))
            else:
                # using rpy2
                outfile.write("%s\n" % "\t".join(["%g" % k for k in summary]))
        finally:
            # Closing twice is harmless; this covers unexpected errors too.
            outfile.close()

        if skipped_lines:
            print("Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % (skipped_lines, first_invalid_line))