def createGraphSeries(cohort):
    """Write the statistics plots for one cohort to '<cohort>.stats.pdf'.

    Reads the tab-separated file '<cohort>.seq.out' and draws eight scatter
    plots from it with scatterPlot().  Then, for every HMM name in the
    built-in list, reads '<cohort>.seq.<hmm>.stats' (tab-separated) and draws
    three line plots from its "Cutoff", "Accuracy", "FPP", "TPP" and "Phi"
    columns.

    Args:
        cohort: cohort name; used as the file-name prefix and, upper-cased,
            in every plot title.

    Returns:
        None.
    """
    hm = ['bothR5', 'bothDX', 'esR5', 'esDX',
          'trofileR5', 'trofileDX', 'mixedR5DX']
    # (x column, y column) for each scatter plot, in drawing order.
    scatter_pairs = [
        ('bothR5', 'bothDX'),
        ('esR5', 'esDX'),
        ('trofileR5', 'trofileDX'),
        ('bothR5', 'esR5'),
        ('trofileR5', 'esR5'),
        ('bothDX', 'esDX'),
        ('trofileDX', 'esDX'),
        ('bothDX', 'mixedR5DX'),
    ]
    cohort_label = cohort.upper()

    stat_data = DataFrame.from_csvfile(cohort + '.seq.out', sep="\t")

    # Begin writing graphs to pdf
    grdevices.pdf(file=cohort + ".stats.pdf", width=7, height=7)
    try:
        # graph 1: scatter plots
        for x_col, y_col in scatter_pairs:
            scatterPlot(stat_data, x_col, y_col,
                        '%s cohort\n %s/%s correlation'
                        % (cohort_label, x_col, y_col))

        for hmm in hm:
            hmm_scores = DataFrame.from_csvfile(
                cohort + '.seq.' + hmm + '.stats', sep="\t")
            cutoff = hmm_scores.rx2("Cutoff")
            # graph 3
            plot(cutoff, hmm_scores.rx2("Accuracy"), type='o',
                 main='Accuracy vs. Cutoff ' + cohort_label
                      + ' Cohort\n' + hmm + '.hmm',
                 xlab='Cutoff', ylab='Accuracy')
            # graph 4
            # NOTE(review): the y axis plots the "TPP" column but is labelled
            # '% False Neg' -- confirm the label is what is intended.
            plot(hmm_scores.rx2("FPP"), hmm_scores.rx2("TPP"), type='o',
                 xlim=base.c(0, 1), ylim=base.c(0, 1),
                 main=cohort_label + ' Cohort ROC\n' + hmm + '.hmm',
                 xlab='% False Pos', ylab='% False Neg')
            # graph 5
            plot(cutoff, hmm_scores.rx2("Phi"), type='o',
                 xlim=base.c(50, 100), ylim=base.c(0, 0.75),
                 main=cohort_label + ' Cohort Association Coeff\n'
                      + hmm + '.hmm',
                 xlab='Cutoff', ylab='Association Coefficient')
    finally:
        # Always close the pdf device, even when a read or plot call fails,
        # so the device is not left open for later plotting calls.
        grdevices.dev_off()
    return
def analyzeR():
    """Load `inputfile` and select a range of its columns.

    Reads the tab-separated file named by the module-level `inputfile`
    (with a header row), locates the 'ic50_val' column and extracts the
    columns numbered `ici` .. `lasti` using R-style (1-based) indexing.

    Returns:
        The extracted sub-data-frame.

    Raises:
        ValueError: if there is no 'ic50_val' column.
    """
    # Local import: only the vector constructor is needed here.
    from rpy2.robjects.vectors import IntVector

    mypatchData = DataFrame.from_csvfile(inputfile, header=True, sep="\t")
    ici = list(mypatchData.colnames).index('ic50_val')
    lasti = mypatchData.nrow
    # The previous `mypatchData[,ici:lasti]` was R syntax and did not parse
    # as Python; `.rx(True, cols)` is the rpy2 spelling of `df[, cols]`.
    # NOTE(review): `ici` is a 0-based Python index used here as a 1-based R
    # column index, so the range starts one column before 'ic50_val', and the
    # upper bound is the row count rather than the column count.  Both are
    # kept exactly as written -- confirm the intended range.
    myData = mypatchData.rx(True, IntVector(range(ici, lasti + 1)))
    return myData
def loadfiles(self):
    """Load files into R environment.

    Reads every path in self.filelist (paired with the separator at the
    same position in self.seplist) into an R data frame, appends the frame
    to self.mylist and the file's base name without extension to
    self.listname.  An IOError raised while reading is added to self.error.
    """
    rcount = 0
    names = robjects.r['names']

    # Defaults for reading from csv.  Note that the 'sep' entry is not used
    # by the read call below, which takes its separator from self.seplist.
    param = {'sep': '\t', 'header': True, 'as_is': True,
             'row.names': ri.NULL}

    # A non-None value supplied in self.param overrides the default.
    for key in list(param):
        if key in self.param and self.param[key] is not None:
            param[key] = self.param[key]
    self.param.update(param)

    # Read all the files into the R environment.
    for path, separator in zip(self.filelist, self.seplist):
        try:
            frame = DataFrame.from_csvfile(path,
                                           sep=str(separator),
                                           header=param['header'],
                                           as_is=param['as_is'],
                                           row_names=param['row.names'])
            self.mylist.append(frame)
            stem = os.path.splitext(path)[0]
            self.listname.append(os.path.split(stem)[1])
            rcount += 1
        except IOError as e:
            self.error += e
def clusterPop(admix, indexes):
    """Order the selected lines of `admix` by hierarchical clustering.

    The lines of `admix` whose 0-based position is in `indexes` are copied
    to a scratch file, read back as a space-separated, header-less R data
    frame and clustered with R's hclust(dist(...)).

    Args:
        admix: path of the input text file.
        indexes: 0-based line positions to include.

    Returns:
        The selected line positions, re-ordered according to the "order"
        component of the hclust result.  With a single index, that index is
        returned in a one-element list without touching any file.
    """
    if len(indexes) == 1:
        return [indexes[0]]

    subf = "adm.%d" % (os.getpid(),)
    wanted = set(indexes)  # O(1) membership test per input line
    oldOrder = []
    try:
        with open(admix) as src, open(subf, "w") as dst:
            for fPos, line in enumerate(src):
                if fPos in wanted:
                    dst.write(line)
                    oldOrder.append(fPos)
        df = DataFrame.from_csvfile(subf, sep=" ", header=False)
        d = robjects.r.hclust(robjects.r.dist(df))
        order = d.rx2("order")
    finally:
        # Remove the scratch file even when reading or clustering fails.
        if os.path.exists(subf):
            os.remove(subf)

    # R positions are 1-based.
    return [oldOrder[pos - 1] for pos in order]
def hae_ennakkoilmoitukset(tiedosto):
    """Load a comma-separated file with a header row into an R data frame.

    `tiedosto` is used as given when it exists; otherwise the same name is
    looked up under DATADIR.

    Raises:
        IOError: if the file exists in neither location.
    """
    path = tiedosto
    if not os.path.exists(path):
        # Fall back to the data directory.
        path = os.path.join(DATADIR, path)
        if not os.path.exists(path):
            raise IOError("Annettua tiedostoa %s ei löydy" % path)
    read_options = {'header': True, 'sep': ',', 'as_is': True}
    return DataFrame.from_csvfile(path, **read_options)
def clusterAll(admix, myPop):
    """Order population names by clustering their mean per-column values.

    Every line of `admix` holds space-separated floats; line i belongs to
    population myPop[i].  The per-population column means are written to a
    scratch file, read back as a header-less R data frame and clustered with
    hclust(dist(..., method="max"), method="complete").

    Args:
        admix: path of the input text file.
        myPop: population name for each line of `admix`, by position.

    Returns:
        The population names, ordered according to the "order" component of
        the hclust result.
    """
    with open(admix) as f:
        ls = f.readlines()
    numK = len(ls[0].split(" "))

    # population name -> (number of lines, per-column running sums)
    pops = {}
    for i, line in enumerate(ls):
        vals = [float(x) for x in line.rstrip().split(" ")]
        cnt, accu = pops.get(myPop[i], (0, [0.0] * numK))
        for i2 in range(numK):
            accu[i2] += vals[i2]
        pops[myPop[i]] = cnt + 1, accu

    popNames = sorted(pops)
    subf = "accu.%d" % (os.getpid(),)
    try:
        with open(subf, "w") as w:
            for popName in popNames:
                cnt, sums = pops[popName]
                w.write(" ".join([str(x / cnt) for x in sums]))
                w.write("\n")
        df = DataFrame.from_csvfile(subf, sep=" ", header=False)
        d = robjects.r.hclust(robjects.r.dist(df, method="max"),
                              method="complete")
        order = d.rx2("order")
    finally:
        # Remove the scratch file even when reading or clustering fails.
        if os.path.exists(subf):
            os.remove(subf)

    # R positions are 1-based.
    return [popNames[pos - 1] for pos in order]
def createGraphSeries(cohort, t, sfiles, ofile):
    """Write the selected statistics plots to '<cohort>.stats.pdf'.

    Args:
        cohort: cohort name; used for the pdf file name and, upper-cased,
            in every plot title.
        t: which plots to draw.  'out' draws the eight scatter plots from
            `ofile`; 'stats' draws the per-HMM plots; 'both' draws both.
        sfiles: mapping from HMM name to the path of its tab-separated
            stats file.  HMM names missing from the mapping are skipped.
        ofile: path of the tab-separated file used for the scatter plots.

    Returns:
        None.
    """
    hm = ['bothR5', 'bothDX', 'esR5', 'esDX',
          'trofileR5', 'trofileDX', 'mixedR5DX']
    # (x column, y column) for each scatter plot, in drawing order.
    scatter_pairs = [
        ('bothR5', 'bothDX'),
        ('esR5', 'esDX'),
        ('trofileR5', 'trofileDX'),
        ('bothR5', 'esR5'),
        ('trofileR5', 'esR5'),
        ('bothDX', 'esDX'),
        ('trofileDX', 'esDX'),
        ('bothDX', 'mixedR5DX'),
    ]
    cohort_label = cohort.upper()

    # Begin writing graphs to pdf
    grdevices.pdf(file=cohort + ".stats.pdf", width=7, height=7)
    try:
        if t == 'both' or t == 'out':
            stat_data = DataFrame.from_csvfile(ofile, sep="\t")
            # graph 1: scatter plots
            for x_col, y_col in scatter_pairs:
                scatterPlot(stat_data, x_col, y_col,
                            '%s cohort\n %s/%s correlation'
                            % (cohort_label, x_col, y_col))

        if t == 'both' or t == 'stats':
            for hmm in hm:
                if hmm not in sfiles:
                    continue
                hmm_scores = DataFrame.from_csvfile(sfiles[hmm], sep="\t")
                cutoff = hmm_scores.rx2("Cutoff")
                sensitivity = hmm_scores.rx2("Sensitivity")
                specificity = hmm_scores.rx2("Specificity")
                title_tail = ' Cohort\n' + hmm + '.hmm'
                # graph 3
                plot(cutoff, hmm_scores.rx2("Accuracy"), type='o',
                     main='Accuracy vs. Cutoff ' + cohort_label + title_tail,
                     xlab='Cutoff', ylab='Accuracy')
                # graph 4
                # NOTE(review): the y axis plots the "TPP" column but is
                # labelled '% False Neg' -- confirm the label is intended.
                plot(hmm_scores.rx2("FPP"), hmm_scores.rx2("TPP"), type='o',
                     xlim=base.c(0, 1), ylim=base.c(0, 1),
                     main=cohort_label + ' Cohort ROC\n' + hmm + '.hmm',
                     xlab='% False Pos', ylab='% False Neg')
                # graph 5
                plot(cutoff, hmm_scores.rx2("Phi"), type='o',
                     xlim=base.c(50, 100), ylim=base.c(0, 0.75),
                     main=cohort_label + ' Cohort Association Coeff\n'
                          + hmm + '.hmm',
                     xlab='Cutoff', ylab='Association Coefficient')
                # graph 6
                plot(cutoff, specificity, type='o',
                     main='Specificity vs. Cutoff ' + cohort_label
                          + title_tail,
                     xlab='Cutoff', ylab='Specificity')
                # graph 7
                plot(cutoff, sensitivity, type='o',
                     main='Sensitivity vs. Cutoff ' + cohort_label
                          + title_tail,
                     xlab='Cutoff', ylab='Sensitivity')
                # graph 8
                plot(sensitivity, specificity, type='o',
                     main='Sensitivity vs. Specificity ' + cohort_label
                          + title_tail,
                     xlab='Sensitivity', ylab='Specificity')
    finally:
        # Always close the pdf device, even when a read or plot call fails.
        grdevices.dev_off()
    return
def loadfiles(self):
    """Load files into R environment.

    Reads every path in self.filelist (paired with the separator at the
    same position in self.seplist) into an R data frame, converts it with
    R's as.matrix and zeroes every diagonal entry that is >= 1e-8.  When at
    least one entry was zeroed the path is added to self.e.  The matrix is
    appended to self.mylist and the file's base name without extension to
    self.listname.  IOError and RRuntimeError are added to self.error.
    """
    rcount = 0
    asmatrix = robjects.r['as.matrix']
    diag = robjects.r['diag']
    names = robjects.r['names']

    ## Defaults for reading from csv
    param = {'header': True, 'as_is': True, 'row.names': ri.RNULLArg}

    ## A non-None value supplied in self.param overrides the default.
    for key in list(param):
        if key in self.param and self.param[key] is not None:
            param[key] = self.param[key]

    for path, separator in zip(self.filelist, self.seplist):
        try:
            dataf = asmatrix(
                DataFrame.from_csvfile(path,
                                       sep=str(separator),
                                       header=param['header'],
                                       as_is=param['as_is'],
                                       row_names=param['row.names']))

            # Should the diagonal be set to 0?
            # Do it for all the inputs, just to be sure.
            # NOTE(review): only entries >= 1e-8 are counted and reset, so
            # negative diagonal values pass through -- confirm.
            zcount = 0
            for pos in xrange(1, dataf.ncol + 1):  # R indices are 1-based
                if dataf.rx(pos, pos)[0] - 0.0 >= 1e-8:
                    zcount += 1
                    dataf.rx[pos, pos] = 0
            if zcount:
                self.e += path

            self.mylist.append(dataf)
            stem = os.path.splitext(path)[0]
            self.listname.append(os.path.split(stem)[1])
            rcount += 1
        except IOError as e:
            self.error += e
        except RRuntimeError as e:
            self.error += e
def stepwise_regression(data, d_v, i_vs):
    """Fit a linear model in R and reduce it by backward stepwise selection.

    `data` is written to a temporary file, read back with
    DataFrame.from_csvfile, fitted with stats.lm using the formula
    '<d_v>~<i_vs joined by +>' and passed to stats.step with
    direction='backward'.  The formula is printed before fitting.

    Args:
        data: iterable of text lines making up the CSV content.
        d_v: name of the dependent variable.
        i_vs: names of the independent variables.

    Returns:
        str() of the first element of the stats.step result.
    """
    # Local imports keep this block self-contained.
    import os
    import tempfile

    stats = importr('stats')
    pat = '%s~%s' % (d_v, '+'.join(i_vs))
    print(pat)

    # A private temporary file replaces the fixed, user-specific scratch
    # path, so the function works on any machine and concurrent calls do
    # not overwrite each other's data.
    tmp = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    try:
        try:
            tmp.writelines(data)
        finally:
            # Close before R opens the file by name.
            tmp.close()
        data_from_input = DataFrame.from_csvfile(tmp.name)
        reg = stats.lm(pat, data_from_input)
        st = stats.step(reg, direction='backward')
        return str(st[0])
    finally:
        os.remove(tmp.name)
def csv2graph(csvfiles, seplist=[], param={}, filepath='.', graph_format='gml'):
    """Utility to convert from csv file to igraph format file.

    Each csv file is read as an adjacency matrix, turned into an undirected
    weighted igraph graph and written to
    os.path.join(filepath, '<csv name>.<graph_format>').

    Args:
        csvfiles: paths of the csv files to convert.
        seplist: field separator for each file, by position.
        param: optional read settings.  'header' (default True) and
            'row.names' (also accepted as 'row_names') are honoured.
        filepath: directory the graph files are written to.
        graph_format: igraph output format name, also used as the file
            extension.

    Returns:
        True once every file has been written.

    Raises:
        IOError: if seplist and csvfiles differ in length.
    """
    # The default list/dict arguments are only read, never mutated.
    # Validate before loading any R package so bad input fails fast.
    if len(seplist) != len(csvfiles):
        raise IOError('Not enough separators')

    igraph = importr('igraph')
    base = importr('base')
    gadj = igraph.graph_adjacency
    wgraph = igraph.write_graph

    for f, separator in zip(csvfiles, seplist):
        read_args = {
            'sep': separator,
            'header': param.get('header', True),
            'as_is': True,
        }
        # Pass row names only when the caller supplied them; otherwise let
        # from_csvfile use its own default.
        for key in ('row.names', 'row_names'):
            if key in param:
                read_args['row_names'] = param[key]
                break
        tmpdata = DataFrame.from_csvfile(f, **read_args)

        # Build the graph from the file that was just read, converted to an
        # R matrix first.
        g = gadj(base.as_matrix(tmpdata), mode='undirected', weighted=True)
        myfname = "%s.%s" % (f, graph_format)
        wgraph(g, file=os.path.join(filepath, myfname), format=graph_format)
    return True
def from_csv(cls, data):
    """Build a `cls` instance from the CSV file named by str(data)."""
    csv_path = str(data)
    frame = DataFrame.from_csvfile(csv_path)
    return cls(frame)
def show4():
    """Run open4(), source end.R and draw a dodged bar chart.

    The chart is built from temp/project2.csv with 'day' on the x axis,
    'time' on the y axis and bars filled by factor(project).
    """
    open4()
    r.source('D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/R/end.R',
             encoding="utf-8")
    data = DataFrame.from_csvfile(
        'D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/temp/project2.csv')

    # Assemble the plot one layer at a time.
    pp = ggplot2.ggplot(data)
    pp = pp + ggplot2.aes_string(x='day', y='time', fill='factor(project)')
    pp = pp + ggplot2.geom_bar(stat='identity', position='dodge')
    pp = pp + ggplot2.ggtitle("两项目时间对比图")
    pp = pp + ggplot2.labs(x='日期', y='时间 (min)')
    # Tilt the x axis tick labels by 45 degrees.
    pp = pp + ggplot2.theme(**{'axis.text.x': ggplot2.element_text(angle=45)})
    pp.plot()
def hae_ehdokkaat():
    """Load DATADIR/e2011ehd.csv (tab-separated, with header) as an R data frame."""
    csv_path = os.path.join(DATADIR, 'e2011ehd.csv')
    read_options = {'header': True, 'sep': '\t', 'as_is': True}
    return DataFrame.from_csvfile(csv_path, **read_options)
def main():
    """Summarise an expression over columns of a tab-separated file.

    Command line: input_file output_file expression.  The expression may
    reference columns as c1, c2, ... and is validated against the 'Math'
    and 'Ops' entries of S3_METHODS().  The referenced columns of every
    non-empty, non-comment line are copied to a temporary file (lines with
    a non-numeric or missing value are skipped), loaded into R, and the
    expression's sum, mean, stdev and quantiles are written to output_file.
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    math_allowed = S3_METHODS()['Math']
    ops_allowed = S3_METHODS()['Ops']

    # Check for invalid expressions
    for word in re.compile(r'[a-zA-Z]+').findall(expression):
        if word and word not in math_allowed:
            stop_err("Invalid expression '%s': term '%s' is not recognized or allowed" % (expression, word))
    symbols = set()
    for symbol in re.compile(r'[^a-z0-9\s]+').findall(expression):
        if symbol and symbol not in ops_allowed:
            stop_err("Invalid expression '%s': operator '%s' is not recognized or allowed" % (expression, symbol))
        else:
            symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err("Invalid columns '%s': this tool requires a single column or expression" % expression)

    # Find all column references in the expression.  The pattern only
    # matches digits after the 'c', so int() cannot fail here.
    column_refs = re.compile(r'c[0-9]+').findall(expression)
    cols = [int(ref[1:]) - 1 for ref in column_refs]

    tmp_file = tempfile.NamedTemporaryFile('w+b')
    try:
        # Write the R header row to the temporary file
        hdr_str = "\t".join("c%s" % str(col + 1) for col in cols)
        tmp_file.write("%s\n" % hdr_str)
        skipped_lines = 0
        first_invalid_line = 0
        i = 0
        with open(datafile) as data_handle:
            for i, line in enumerate(data_handle):
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue
                fields = line.split('\t')
                # Write the R data row to the temporary file, unless one of
                # the referenced columns is missing or not a number.
                valid = True
                for col in cols:
                    try:
                        float(fields[col])
                    except (ValueError, IndexError):
                        skipped_lines += 1
                        if not first_invalid_line:
                            first_invalid_line = i + 1
                        valid = False
                        break
                if valid:
                    data_str = "\t".join(fields[col] for col in cols)
                    tmp_file.write("%s\n" % data_str)
        # R reads the file by name, so the buffered rows must be on disk.
        tmp_file.flush()

        if skipped_lines == i + 1:
            stop_err("Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements.")
        else:
            # summary function and return labels
            summary_func = r("function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }")
            headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']
            headings_str = "\t".join(headings)

            r_data_frame = DataFrame.from_csvfile(tmp_file.name, header=True, sep="\t")

            with open(outfile_name, 'w') as outfile:
                # Expose each referenced column to R under its cN name.
                for col in column_refs:
                    r.assign(col, r["$"](r_data_frame, col))
                try:
                    summary = summary_func(r(expression))
                except RException as s:
                    stop_err("Computation resulted in the following error: %s" % str(s))

                outfile.write("#%s\n" % headings_str)
                print(summary)
                print(summary.r_repr())
                outfile.write("%s\n" % "\t".join(["%g" % (summary.rx2(k)[0]) for k in headings]))

            if skipped_lines:
                print("Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % (skipped_lines, first_invalid_line))
    finally:
        # Closing the NamedTemporaryFile also deletes it.
        tmp_file.close()
def main():
    """Summarise an expression over columns of a tab-separated file.

    Command line: input_file output_file expression.  The expression may
    reference columns as c1, c2, ... and is validated against the 'Math'
    and 'Ops' entries of S3_METHODS().  The referenced columns of every
    non-empty, non-comment line are copied to a temporary file (lines with
    a non-numeric or missing value are skipped), loaded into R, and the
    expression's sum, mean, stdev and quantiles are written to output_file.
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    math_allowed = S3_METHODS()['Math']
    ops_allowed = S3_METHODS()['Ops']

    # Check for invalid expressions
    word_pattern = re.compile(r'[a-zA-Z]+')
    for word in word_pattern.findall(expression):
        if word and word not in math_allowed:
            stop_err("Invalid expression '%s': term '%s' is not recognized or allowed" % (expression, word))

    symbols = set()
    symbol_pattern = re.compile(r'[^a-z0-9\s]+')
    for symbol in symbol_pattern.findall(expression):
        if symbol and symbol not in ops_allowed:
            stop_err("Invalid expression '%s': operator '%s' is not recognized or allowed" % (expression, symbol))
        else:
            symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err("Invalid columns '%s': this tool requires a single column or expression" % expression)

    # Find all column references in the expression.  The pattern only
    # matches digits after the 'c', so int() cannot fail here.
    column_refs = re.compile(r'c[0-9]+').findall(expression)
    cols = [int(ref[1:]) - 1 for ref in column_refs]

    tmp_file = tempfile.NamedTemporaryFile('w+b')
    try:
        # Write the R header row to the temporary file
        hdr_str = "\t".join("c%s" % str(col + 1) for col in cols)
        tmp_file.write("%s\n" % hdr_str)

        skipped_lines = 0
        first_invalid_line = 0
        i = 0
        with open(datafile) as data_handle:
            for i, line in enumerate(data_handle):
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue
                fields = line.split('\t')
                # Write the R data row to the temporary file, unless one of
                # the referenced columns is missing or not a number.
                valid = True
                for col in cols:
                    try:
                        float(fields[col])
                    except (ValueError, IndexError):
                        skipped_lines += 1
                        if not first_invalid_line:
                            first_invalid_line = i + 1
                        valid = False
                        break
                if valid:
                    data_str = "\t".join(fields[col] for col in cols)
                    tmp_file.write("%s\n" % data_str)
        # R reads the file by name, so the buffered rows must be on disk.
        tmp_file.flush()

        if skipped_lines == i + 1:
            stop_err("Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements.")
        else:
            # summary function and return labels
            summary_func = r("function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }")
            headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']
            headings_str = "\t".join(headings)

            r_data_frame = DataFrame.from_csvfile(tmp_file.name, header=True, sep="\t")

            with open(outfile_name, 'w') as outfile:
                # Expose each referenced column to R under its cN name.
                for col in column_refs:
                    r.assign(col, r["$"](r_data_frame, col))
                try:
                    summary = summary_func(r(expression))
                except RException as s:
                    stop_err("Computation resulted in the following error: %s" % str(s))

                outfile.write("#%s\n" % headings_str)
                print(summary)
                print(summary.r_repr())
                outfile.write("%s\n" % "\t".join(["%g" % (summary.rx2(k)[0]) for k in headings]))

            if skipped_lines:
                print("Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % (skipped_lines, first_invalid_line))
    finally:
        # Closing the NamedTemporaryFile also deletes it.
        tmp_file.close()
# R helper: turns a correlation matrix into a distance object.
# Computes cor(x, use = use) and returns as.dist(1 - correlation).
distEisen = robjects.r('''
distEisen <- function(x, use = "pairwise.complete.obs") {
    co.x <- cor(x, use = use)
    dist.co.x <- 1 - co.x
    return(as.dist(dist.co.x))
}
''')

# R helper: converts a list of vectors into a data frame.  Every element is
# extended with `fill` up to the length of the longest element first.
listToDF = robjects.r('''
listToDF <- function(inputList, fill = NA){
    # Use fill = NULL for regular recycling behavior
    maxLen = max(sapply(inputList, length))
    for(i in seq_along(inputList))
        inputList[[i]] <- c(inputList[[i]], rep(fill, maxLen - length(inputList[[i]])))
    return(as.data.frame(inputList))
}
''')

# Tab-separated annotation table with a header row; the first column
# supplies the row names.
annotations = DataFrame.from_csvfile(annotation_classes_input_file, header=True, sep='\t', quote='"', row_names=1)

# Shorthand for the embedded R instance; load the R packages used below.
R = robjects.r
R["library"]("utils")
R["library"]("tools")
def show1():
    """Run open1(), source head1.r and draw a bar chart.

    The chart is built from temp/day1.csv with 'project' on the x axis,
    'time' on the y axis and bars filled by project.
    """
    open1()
    r.source('D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/R/head1.r',
             encoding="utf-8")
    data = DataFrame.from_csvfile(
        'D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/temp/day1.csv')

    # Assemble the plot one layer at a time.
    pp = ggplot2.ggplot(data)
    pp = pp + ggplot2.aes_string(x='project', y='time', fill='project')
    pp = pp + ggplot2.geom_bar(stat='identity')
    pp = pp + ggplot2.ggtitle("今日项目时间分布图")
    pp = pp + ggplot2.labs(x='项目', y='时间 (min)')
    # Tilt the x axis tick labels by 45 degrees.
    pp = pp + ggplot2.theme(**{'axis.text.x': ggplot2.element_text(angle=45)})
    pp.plot()
def __init__(self, dex_name):
    """Read the CSV file `dex_name` and keep it as self.dexcom_data."""
    frame = DataFrame.from_csvfile(dex_name)
    self.dexcom_data = frame
import rpy2.robjects.pandas2ri
from rpy2.robjects.vectors import DataFrame
import math
import datetime

# Command line: -in_csv <input csv>, -out <output>.
parser = argparse.ArgumentParser()
parser.add_argument("-in_csv", help="")
parser.add_argument("-out", help="")
args=parser.parse_args()

# Load the comma-separated input file (with a header row) as an R data frame.
dataf = DataFrame.from_csvfile(args.in_csv, sep = ",",header=True)

# Get statistics for investigated seqs
# Handles to the R summary functions.
rmean = robjects.r['mean']
rmed = robjects.r['median']
rmax = robjects.r['max']
rsd = robjects.r['sd']
rsum = robjects.r['sum']

# R max() applied to the 'hitlen' column selection.
ma=rmax(dataf.rx('hitlen'))

# Handles to the R conversion functions.
as_vec = robjects.r['as.vector']
as_num = robjects.r['as.numeric']
as_mat = robjects.r['as.matrix']
#test22=as_vec(dataf.rx('Length'))
## print pat ## return None # __file.close() # data_from_input = DataFrame.from_csvfile(__file.name) # reg = stats.lm(pat, data_from_input) # st = stats.step(reg, direction = 'backward') ## ret = '' ## for key, value in st.iteritems(): ## ret += key + ',' + str(value) + '\n' ## return ret # return '' # stepwise_regression() d_v = "inflat" i_vs = ["money", "output", "initial", "poprate", "inv", "school"] data_from_input = DataFrame.from_csvfile("/home/foodfan/money.csv") stats = importr("stats") pat = "%s~%s" % (d_v, "+".join(i_vs)) # print pat # return None reg = stats.lm(pat, data_from_input) st = stats.step(reg, direction="backward") ret = "" print "------------------------------------" print str(st[0]) # print st[1] # for key, value in st.iteritems(): # ret += key + ',' + str(value) + '\n' print ret
from rpy2.robjects.vectors import DataFrame
import math
import datetime

# Command line: -in_csv <input csv>, -out <output>.
parser = argparse.ArgumentParser()
parser.add_argument("-in_csv", help="")
parser.add_argument("-out", help="")
args=parser.parse_args()

# Honour -in_csv when it is given; otherwise fall back to the path that
# used to be hard-coded, so running without arguments behaves as before.
infile = args.in_csv if args.in_csv else "/Users/security/science/bigoutput.csv"

# Load the comma-separated input file (with a header row) as an R data frame.
dataf = DataFrame.from_csvfile(infile, sep = ",",header=True)

# Get statistics for investigated seqs
#rmean = robjects.r['mean']
#rmed = robjects.r['median']
#rmax = robjects.r['max']
#rsd = robjects.r['sd']
#rsum = robjects.r['sum']
#
#ma=rmax(dataf.rx('Length'))
#
#as_vec = robjects.r['as.vector']
#as_num = robjects.r['as.numeric']
#as_mat = robjects.r['as.matrix']
#
#test22=as_vec(dataf.rx('Length'))