def Glm(x, y, famil='gaussian'):
    """
    Fit the generalized linear model ``Y~X`` through R's ``glm`` (via rpy).

    :param x: X-axis values
    :param y: Y-axis values
    :param famil: name of the error family handed to ``glm`` as ``family``
    :type x: float list
    :type y: float list
    :type famil: string
    :returns: a dictionary with the keys ``'Intercept'``, ``'Slope'`` and
        ``'Family'``, read from the fitted model's ``coefficients`` and
        ``family`` entries
    :rtype: dict
    :note: the 2 vector/list must have the same size
    """
    formula = rpy.r("Y~X")
    frame = rpy.r.data_frame(X=x, Y=y)
    fit = rpy.r.glm(formula, data=frame, family=famil)

    coefficients = fit['coefficients']
    # The link function is not reported.
    return {
        'Intercept': coefficients['(Intercept)'],
        'Slope': coefficients['X'],
        'Family': fit['family']['family'],
    }
def multiReg(x, y, colList, alpha):
    """
    Fit a multiple linear regression of ``y`` on selected entries of ``x``
    through R's ``lm`` (via rpy).

    :param x: indexable collection of explanatory series; ``x[i]`` is used
        for every ``i`` in ``colList``
    :param y: response values
    :param colList: indices into ``x`` selecting the regressors
    :param alpha: used as ``qnorm(1 - alpha/200)``
        (presumably a two-sided risk level in percent -- TODO confirm)
    :returns: a dictionary with the keys ``'regressor'``, ``'intercept'``,
        ``'r2'``, ``'adj_r2'``, ``'ic'``, ``'x'`` and ``'y'``
    :rtype: dict
    """
    frame = {'Y': y}
    selected = []
    terms = []
    for idx in colList:
        label = "X" + str(idx)
        frame[label] = x[idx]
        selected.append(x[idx])
        terms.append(label + "+")
    print(frame)

    # Every term carries a trailing "+", so the closing "1" (the explicit
    # intercept) always yields a well-formed formula, even with no regressor.
    formula = rpy.r("Y~" + "".join(terms) + "1")

    root_n = rpy.sqrt(len(y))
    quantile = rpy.r.qnorm(1. - (alpha / 200.))

    # lm is called without conversion so the fitted object stays an R object
    # that can be handed back to R's summary().
    raw_lm = rpy.with_mode(rpy.NO_CONVERSION, rpy.r.lm)
    fit = raw_lm(formula, data=frame)
    summary = rpy.r.summary(fit)
    print(summary)

    # First column of the coefficient table: intercept first, then one
    # value per regressor.
    estimates = summary['coefficients'][:, 0]
    return {
        'regressor': estimates[1:],
        'intercept': estimates[0],
        'r2': summary['r.squared'],
        'adj_r2': summary['adj.r.squared'],
        'ic': summary['sigma'] * quantile / root_n,
        'x': selected,
        'y': y,
    }
def regLinOri(x, y, alpha=5):
    """
    Run the pass-through-origin linear regression ``Y~-1+X`` on two vectors.

    :Parameters:
     - `x`: X-axis values
     - `y`: Y-axis values
     - `alpha`: forwarded unchanged to ``regression`` (default 5)

    :Types:
     - `x`: float list
     - `y`: float list

    :returns: the result of ``regression(x, y, model, alpha)`` for the
        intercept-free model

    :attention: the 2 vector/list must have the same size
    """
    # "-1" removes the intercept term from the R formula.
    formula = rpy.r("Y~-1+X")
    return regression(x, y, formula, alpha)
def regLin(x, y, alpha=5):
    """
    Run the simple linear regression ``Y~X`` on two vectors.

    :Parameters:
     - `x`: X-axis values
     - `y`: Y-axis values
     - `alpha`: forwarded unchanged to ``regression`` (default 5)

    :Types:
     - `x`: float list
     - `y`: float list

    :returns: the result of ``regression(x, y, model, alpha)`` for the
        model with intercept

    :attention: the 2 vector/list must have the same size
    """
    formula = rpy.r("Y~X")
    return regression(x, y, formula, alpha)
def regLinOri(x, y, alpha=5):
    """
    Run the pass-through-origin linear regression ``Y~-1+X`` on two vectors.

    :Parameters:
     - `x`: X-axis values
     - `y`: Y-axis values
     - `alpha`: forwarded unchanged to ``regression`` (default 5)

    :Types:
     - `x`: float list
     - `y`: float list

    :returns: the result of ``regression(x, y, model, alpha)`` for the
        intercept-free model

    :attention: the 2 vector/list must have the same size
    """
    # "-1" removes the intercept term from the R formula.
    formula = rpy.r("Y~-1+X")
    return regression(x, y, formula, alpha)
def regLin(x, y, alpha=5):
    """
    Run the simple linear regression ``Y~X`` on two vectors.

    :Parameters:
     - `x`: X-axis values
     - `y`: Y-axis values
     - `alpha`: forwarded unchanged to ``regression`` (default 5)

    :Types:
     - `x`: float list
     - `y`: float list

    :returns: the result of ``regression(x, y, model, alpha)`` for the
        model with intercept

    :attention: the 2 vector/list must have the same size
    """
    formula = rpy.r("Y~X")
    return regression(x, y, formula, alpha)
def multiReg(x, y, colList, alpha):
    """
    Fit a multiple linear regression of ``y`` on selected entries of ``x``
    through R's ``lm`` (via rpy).

    :param x: indexable collection of explanatory series; ``x[i]`` is used
        for every ``i`` in ``colList``
    :param y: response values
    :param colList: indices into ``x`` selecting the regressors
    :param alpha: used as ``qnorm(1 - alpha/200)``
        (presumably a two-sided risk level in percent -- TODO confirm)
    :returns: a dictionary with the keys ``'regressor'``, ``'intercept'``,
        ``'r2'``, ``'adj_r2'``, ``'ic'``, ``'x'`` and ``'y'``
    :rtype: dict
    """
    frame = {'Y': y}
    selected = []
    terms = []
    for idx in colList:
        label = "X" + str(idx)
        frame[label] = x[idx]
        selected.append(x[idx])
        terms.append(label + "+")
    print(frame)

    # Every term carries a trailing "+", so the closing "1" (the explicit
    # intercept) always yields a well-formed formula, even with no regressor.
    formula = rpy.r("Y~" + "".join(terms) + "1")

    root_n = rpy.sqrt(len(y))
    quantile = rpy.r.qnorm(1. - (alpha / 200.))

    # lm is called without conversion so the fitted object stays an R object
    # that can be handed back to R's summary().
    raw_lm = rpy.with_mode(rpy.NO_CONVERSION, rpy.r.lm)
    fit = raw_lm(formula, data=frame)
    summary = rpy.r.summary(fit)
    print(summary)

    # First column of the coefficient table: intercept first, then one
    # value per regressor.
    estimates = summary['coefficients'][:, 0]
    return {
        'regressor': estimates[1:],
        'intercept': estimates[0],
        'r2': summary['r.squared'],
        'adj_r2': summary['adj.r.squared'],
        'ic': summary['sigma'] * quantile / root_n,
        'x': selected,
        'y': y,
    }
def Glm(x, y, famil='gaussian'):
    """
    Fit the generalized linear model ``Y~X`` through R's ``glm`` (via rpy).

    :param x: X-axis values
    :param y: Y-axis values
    :param famil: name of the error family handed to ``glm`` as ``family``
    :type x: float list
    :type y: float list
    :type famil: string
    :returns: a dictionary with the keys ``'Intercept'``, ``'Slope'`` and
        ``'Family'``, read from the fitted model's ``coefficients`` and
        ``family`` entries
    :rtype: dict
    :note: the 2 vector/list must have the same size
    """
    formula = rpy.r("Y~X")
    frame = rpy.r.data_frame(X=x, Y=y)
    fit = rpy.r.glm(formula, data=frame, family=famil)

    coefficients = fit['coefficients']
    # The link function is not reported.
    return {
        'Intercept': coefficients['(Intercept)'],
        'Slope': coefficients['X'],
        'Family': fit['family']['family'],
    }
def main():
    """
    Compute summary statistics of a column expression with R.

    Reads three command-line arguments from ``sys.argv``: the tab-separated
    input file, the output file and an expression over columns written as
    ``c<N>`` (1-based column number).

    Steps:

    1. Every alphabetic word of the expression must be listed in
       ``S3_METHODS()['Math']`` and every operator run in
       ``S3_METHODS()['Ops']``; otherwise ``stop_err`` is called.
    2. The referenced columns of every non-blank, non-comment input line are
       copied to a temporary table. A line is skipped (and counted) when a
       referenced field is missing or is not parseable as a float.
    3. The table is loaded in R, the expression is evaluated and its sum,
       mean, standard deviation and quantiles are written to the output
       file as a ``#``-prefixed heading line followed by one data line.
    4. When lines were skipped, a notice is printed on standard output.
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    s3_methods = S3_METHODS()
    math_allowed = s3_methods['Math']
    ops_allowed = s3_methods['Ops']

    # Check for invalid expressions
    for word in re.findall(r'[a-zA-Z]+', expression):
        if word not in math_allowed:
            stop_err("Invalid expression '%s': term '%s' is not recognized or allowed" % (expression, word))
    symbols = set()
    for symbol in re.findall(r'[^a-z0-9\s]+', expression):
        if symbol not in ops_allowed:
            stop_err("Invalid expression '%s': operator '%s' is not recognized or allowed" % (expression, symbol))
        else:
            symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list of columns
        stop_err("Invalid columns '%s': this tool requires a single column or expression" % expression)

    # Find all column references in the expression. The pattern only matches
    # ASCII digits after the "c", so the int() conversion cannot fail.
    column_refs = re.findall(r'c[0-9]+', expression)
    cols = [int(ref[1:]) - 1 for ref in column_refs]

    # Text mode: the table is written as str and read back by R via its name.
    with tempfile.NamedTemporaryFile('w+') as tmp_file:
        # Write the R header row to the temporary file
        tmp_file.write("%s\n" % "\t".join("c%s" % str(col + 1) for col in cols))

        skipped_lines = 0
        first_invalid_line = 0
        i = 0
        with open(datafile) as data_in:
            for i, line in enumerate(data_in):
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue
                fields = line.split('\t')
                try:
                    # ValueError: not a number; IndexError: column missing.
                    for col in cols:
                        float(fields[col])
                except (ValueError, IndexError):
                    skipped_lines += 1
                    if not first_invalid_line:
                        first_invalid_line = i + 1
                    continue
                # Write the R data row to the temporary file
                tmp_file.write("%s\n" % "\t".join(fields[col] for col in cols))
        tmp_file.flush()

        # NOTE(review): blank and comment lines are not counted as skipped,
        # so this only triggers when every physical line was an invalid
        # data line.
        if skipped_lines == i + 1:
            stop_err("Invalid column or column data values invalid for computation. See tool tips and syntax for data requirements.")
        else:
            # summary function and return labels
            set_default_mode(NO_CONVERSION)
            summary_func = r("function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }")
            headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']
            headings_str = "\t".join(headings)

            r_data_frame = r.read_table(tmp_file.name, header=True, sep="\t")

            # The context manager closes the output file on every exit path,
            # including the SystemExit presumably raised by stop_err.
            with open(outfile_name, 'w') as outfile:
                # Expose each referenced column to R under its "c<N>" name.
                for col in column_refs:
                    r.assign(col, r["$"](r_data_frame, col))
                try:
                    summary = summary_func(r(expression))
                except RException as s:
                    stop_err("Computation resulted in the following error: %s" % str(s))
                summary = summary.as_py(BASIC_CONVERSION)
                outfile.write("#%s\n" % headings_str)
                if isinstance(summary, dict):
                    # using rpy
                    values = [summary[k] for k in headings]
                else:
                    # using rpy2
                    values = summary
                outfile.write("%s\n" % "\t".join(["%g" % v for v in values]))

            if skipped_lines:
                print("Skipped %d invalid lines beginning with line #%d. See tool tips for data requirements." % (skipped_lines, first_invalid_line))
def main():
    """
    Compute summary statistics of a column expression with R.

    Reads three command-line arguments from ``sys.argv``: the tab-separated
    input file, the output file and an expression over columns written as
    ``c<N>`` (1-based column number).

    Steps:

    1. Every alphabetic word of the expression must be listed in
       ``S3_METHODS()['Math']`` and every operator run in
       ``S3_METHODS()['Ops']``; otherwise ``stop_err`` is called.
    2. The referenced columns of every non-blank, non-comment input line are
       copied to a temporary table. A line is skipped (and counted) when a
       referenced field is missing or is not parseable as a float.
    3. The table is loaded in R, the expression is evaluated and its sum,
       mean, standard deviation and quantiles are written to the output
       file as a ``#``-prefixed heading line followed by one data line.
    4. When lines were skipped, a notice is printed on standard output.
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    s3_methods = S3_METHODS()
    math_allowed = s3_methods['Math']
    ops_allowed = s3_methods['Ops']

    # Check for invalid expressions
    for word in re.findall(r'[a-zA-Z]+', expression):
        if word not in math_allowed:
            stop_err("Invalid expression '%s': term '%s' is not recognized or allowed" % (expression, word))
    symbols = set()
    for symbol in re.findall(r'[^a-z0-9\s]+', expression):
        if symbol not in ops_allowed:
            stop_err("Invalid expression '%s': operator '%s' is not recognized or allowed" % (expression, symbol))
        else:
            symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list of columns
        stop_err("Invalid columns '%s': this tool requires a single column or expression" % expression)

    # Find all column references in the expression. The pattern only matches
    # ASCII digits after the "c", so the int() conversion cannot fail.
    column_refs = re.findall(r'c[0-9]+', expression)
    cols = [int(ref[1:]) - 1 for ref in column_refs]

    # Text mode: the table is written as str and read back by R via its name.
    with tempfile.NamedTemporaryFile('w+') as tmp_file:
        # Write the R header row to the temporary file
        tmp_file.write("%s\n" % "\t".join("c%s" % str(col + 1) for col in cols))

        skipped_lines = 0
        first_invalid_line = 0
        i = 0
        with open(datafile) as data_in:
            for i, line in enumerate(data_in):
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue
                fields = line.split('\t')
                try:
                    # ValueError: not a number; IndexError: column missing.
                    for col in cols:
                        float(fields[col])
                except (ValueError, IndexError):
                    skipped_lines += 1
                    if not first_invalid_line:
                        first_invalid_line = i + 1
                    continue
                # Write the R data row to the temporary file
                tmp_file.write("%s\n" % "\t".join(fields[col] for col in cols))
        tmp_file.flush()

        # NOTE(review): blank and comment lines are not counted as skipped,
        # so this only triggers when every physical line was an invalid
        # data line.
        if skipped_lines == i + 1:
            stop_err("Invalid column or column data values invalid for computation. See tool tips and syntax for data requirements.")
        else:
            # summary function and return labels
            set_default_mode(NO_CONVERSION)
            summary_func = r("function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }")
            headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']
            headings_str = "\t".join(headings)

            r_data_frame = r.read_table(tmp_file.name, header=True, sep="\t")

            # The context manager closes the output file on every exit path,
            # including the SystemExit presumably raised by stop_err.
            with open(outfile_name, 'w') as outfile:
                # Expose each referenced column to R under its "c<N>" name.
                for col in column_refs:
                    r.assign(col, r["$"](r_data_frame, col))
                try:
                    summary = summary_func(r(expression))
                except RException as s:
                    stop_err("Computation resulted in the following error: %s" % str(s))
                summary = summary.as_py(BASIC_CONVERSION)
                outfile.write("#%s\n" % headings_str)
                if isinstance(summary, dict):
                    # using rpy
                    values = [summary[k] for k in headings]
                else:
                    # using rpy2
                    values = summary
                outfile.write("%s\n" % "\t".join(["%g" % v for v in values]))

            if skipped_lines:
                print("Skipped %d invalid lines beginning with line #%d. See tool tips for data requirements." % (skipped_lines, first_invalid_line))