def readRates(infile):
    """read rates and G+C from a tab-separated file."""
    from rpy import r as R
    import rpy

    handle, name = tempfile.mkstemp()
    os.close(handle)
    outfile = open(name, "w")

    first = True
    headers = []
    for line in infile:
        if line[0] == "#":
            continue
        data = line[:-1].split("\t")
        if first:
            headers = data
            first = False
            continue
        outfile.write(line)
    outfile.close()

    assert len(headers) == 3, "malformatted file of rates, please supply id, g+c, rate"

    rpy.set_default_mode(rpy.NO_CONVERSION)
    matrix = R.read_table(name, na_string=("NA", "na"), col_names=headers)
    rpy.set_default_mode(rpy.BASIC_CONVERSION)

    os.remove(name)
    return matrix, headers
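A minimal standalone sketch of the conversion-mode toggling that readRates() and most of the snippets below rely on; it assumes rpy 1.x and R are installed, and the data values are illustrative only:

import rpy
from rpy import r

rpy.set_default_mode(rpy.NO_CONVERSION)      # results stay as R objects (Robj) that can be passed back to R
frame = r.data_frame(x=[1, 2, 3], y=[2.0, 4.1, 5.9])
rpy.set_default_mode(rpy.BASIC_CONVERSION)   # results are converted to plain Python objects again
print(r.dim(frame))                          # expected to print [3, 2]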
def randomForest_predict(self, fit_model, data):
    """
    03-17-06
    2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
    """
    if self.debug:
        sys.stderr.write("Predicting by randomForest...\n")
    data = array(data)
    set_default_mode(NO_CONVERSION)
    data_frame = r.as_data_frame(
        {
            "p_value": data[:, 0],
            "recurrence": data[:, 1],
            "connectivity": data[:, 2],
            "cluster_size": data[:, 3],
            "gradient": data[:, 4],
            "avg_degree": data[:, 5],
            "unknown_ratio": data[:, 6],
            "is_correct": r.factor(data[:, -1]),
        }
    )
    set_default_mode(BASIC_CONVERSION)
    pred = r.predict(fit_model, data_frame)
    del data_frame
    if self.debug:
        sys.stderr.write("Done randomForest prediction.\n")
    return pred
def rpart_predict(self, fit_model, data):
    """
    11-23-05
        split from rpart_fit_and_predict()
    """
    if self.debug:
        sys.stderr.write("Doing rpart_predict...\n")
    data = array(data)
    set_default_mode(NO_CONVERSION)
    data_frame = r.as_data_frame(
        {
            "p_value": data[:, 0],
            "recurrence": data[:, 1],
            "connectivity": data[:, 2],
            "cluster_size": data[:, 3],
            "gradient": data[:, 4],
            "is_correct": data[:, -1],
        }
    )
    set_default_mode(BASIC_CONVERSION)
    pred = r.predict(fit_model, data_frame, type=["class"])  # 11-17-05 type=c("class")
    del data_frame
    if self.debug:
        sys.stderr.write("Done rpart_predict.\n")
    return pred
def generateCorStructForGLSFromVarianceMatrix(cls, variance_matrix):
    """
    2009-12-23
        generate the corStruct for gls()
    """
    sys.stderr.write("Generating corStruct for gls() from variance_matrix ...")
    rpy.set_default_mode(rpy.NO_CONVERSION)  # 04-07-05
    rpy.r.library("nlme")
    # bring the lower-triangle of variance_matrix into a list, row by row
    no_of_rows, no_of_cols = variance_matrix.shape
    lower_triangle_cor_vector = []
    for i in range(1, no_of_rows):
        for j in range(i):
            lower_triangle_cor_vector.append(
                variance_matrix[i][j] / math.sqrt(variance_matrix[i][i] * variance_matrix[j][j])
            )
    csSymm = rpy.r.corSymm(value=lower_triangle_cor_vector)
    data_frame = rpy.r.as_data_frame({"fakedata": [1] * no_of_rows})
    csSymm = rpy.r.Initialize(csSymm, data=data_frame)
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    sys.stderr.write("Done.\n")
    return csSymm
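A small numpy-only illustration (the 3x3 matrix is made up) of the row-by-row lower-triangle ordering that the loop above feeds to corSymm:

import math
import numpy

variance_matrix = numpy.array([[4.0, 1.0, 0.4],
                               [1.0, 9.0, 2.1],
                               [0.4, 2.1, 1.0]])
vec = []
for i in range(1, variance_matrix.shape[0]):
    for j in range(i):
        vec.append(variance_matrix[i][j] /
                   math.sqrt(variance_matrix[i][i] * variance_matrix[j][j]))
print(vec)  # correlations in the order [r(2,1), r(3,1), r(3,2)]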
def regress(dv, iv):
    # Performs regression using R's linear model function (lm)
    if type(dv.values()) is list and all(type(x) is list for x in iv.values()):
        # First check that all of the data is in list form, otherwise RPy will throw an error
        rpy.set_default_mode(rpy.NO_CONVERSION)   # Keeps values in R format until we need them
        R_string, frame = make_R_strings(dv, iv)  # Create strings used by RPy to run regression
        # R runs the linear regression
        OLS_model = eval('rpy.r.lm(R_string, data=rpy.r.data_frame(' + frame + '))')
        rpy.set_default_mode(rpy.BASIC_CONVERSION)  # Now convert back to usable format
        model_summary = rpy.r.summary(OLS_model)    # Store results
        # Extract all of the data of interest
        coeff = model_summary['coefficients'][:, 0]    # Regression coefficients
        std_err = model_summary['coefficients'][:, 1]  # Standard errors
        t_stat = model_summary['coefficients'][:, 2]   # t-statistics
        p_val = model_summary['coefficients'][:, 3]    # p-values
        r_sqr = model_summary['r.squared']             # R-squared
        adj_r_sqr = model_summary['adj.r.squared']     # Adjusted R-squared
        return coeff, std_err, t_stat, p_val, r_sqr, adj_r_sqr
    else:
        raise TypeError("All variables must be of type 'list'")
def lm(self, l, h):
    for i in range(l, h + 1):
        data_frame, data_model = self.mount_reg_params(i)
        print data_model
        rpy.set_default_mode(rpy.NO_CONVERSION)
        linear_model = r.lm(r(data_model), data=data_frame)
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        print r.summary(linear_model)['r.squared']
def VegetationClassify(Elev_arr, River_arr):
    rpy.r.library("rpart")
    # Read the dictionary from the pickle file
    pkl_file = open('decision_tree.pkl', 'rb')
    rpy.set_default_mode(rpy.NO_CONVERSION)
    training_data = pickle.load(pkl_file)
    pkl_file.close()
    # Create the decision tree for predicting landcover class using rpart
    fit = rpy.r.rpart(formula='Class ~ Elevation + RiverDistance + Slope \
        + Aspect_x + Aspect_y', data=training_data, method="class")
    # Calculate river distance using River_arr
    River_dist_arr = dist.CityBlock(River_arr)
    # Calculate slope and aspect
    (Slope_arr, Aspect_arr) = Slope_aspect.Slope_aspect(Elev_arr)
    (x_len, y_len) = Elev_arr.shape
    # Allocate vegetation array for holding predicted landcover values
    Veg_arr = numpy.zeros((x_len, y_len), dtype="uint8")
    # Normalize the elevation data
    minimum_elev = numpy.min(Elev_arr)
    factor = numpy.max(Elev_arr) - minimum_elev
    Elev_arr = (Elev_arr[:, :] - minimum_elev) * 100 / factor
    # Create various lists to hold test data
    Elevation = []
    Slope = []
    RiverDistance = []
    Aspect_x = []
    Aspect_y = []
    # Append the data into the respective lists
    for i in range(0, x_len):
        for j in range(0, y_len):
            Elevation.append(int(Elev_arr[i][j]))
            Slope.append(int(Slope_arr[i][j]))
            RiverDistance.append(int(River_dist_arr[i][j]))
            Aspect_x.append(int(Aspect_arr[i][j][0]))
            Aspect_y.append(int(Aspect_arr[i][j][1]))
    # Create a dictionary so as to apply R's predict command on it
    Test_data = {'Elevation': Elevation, 'Slope': Slope, 'RiverDistance': RiverDistance,
                 'Aspect_x': Aspect_x, 'Aspect_y': Aspect_y}
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    # values contain probability values of the predicted landcover classes
    values = rpy.r.predict(fit, newdata=Test_data, method="class")
    for i in range(0, x_len):
        for j in range(0, y_len):
            # Get the class having max probability for each test data point
            # (test rows were appended row-major, so the flat index is i*y_len + j)
            a = ndimage.maximum_position(values[i * y_len + j])
            Veg_arr[i, j] = (a[0] * 25)  # Assign spaced values to facilitate visualization
    return Veg_arr
def randomForest_fit(self, known_data, parameter_list, bit_string="1111111"):
    """
    03-17-06
    2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
    """
    if self.debug:
        sys.stderr.write("Fitting randomForest...\n")
    mty = parameter_list[0]
    from rpy import r
    # better than r.library("randomForest", lib_loc=os.path.join(lib_path, "R")) (see plone doc)
    r._libPaths(os.path.join(lib_path, "R"))
    r.library("randomForest")
    coeff_name_list = [
        "p_value",
        "recurrence",
        "connectivity",
        "cluster_size",
        "gradient",
        "avg_degree",
        "unknown_ratio",
    ]  # 2006-10-30
    formula_list = []
    for i in range(len(bit_string)):
        if bit_string[i] == "1":
            formula_list.append(coeff_name_list[i])
    formula = r("is_correct~%s" % "+".join(formula_list))

    known_data = array(known_data)
    set_default_mode(NO_CONVERSION)
    data_frame = r.as_data_frame(
        {
            "p_value": known_data[:, 0],
            "recurrence": known_data[:, 1],
            "connectivity": known_data[:, 2],
            "cluster_size": known_data[:, 3],
            "gradient": known_data[:, 4],
            "avg_degree": known_data[:, 5],
            "unknown_ratio": known_data[:, 6],
            "is_correct": r.factor(known_data[:, -1]),
        }
    )  # 03-17-06, watch r.factor
    # 2006-10-30 R's randomForest() names this argument mtry
    if mty > 0:
        fit = r.randomForest(formula, data=data_frame, mtry=mty)
    else:
        fit = r.randomForest(formula, data=data_frame)
    del data_frame
    if self.debug:
        sys.stderr.write("Done fitting randomForest.\n")
    return fit
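A tiny standalone illustration of the bit_string-to-formula mechanic used by randomForest_fit() above (and by the rpart fitting code further down); the bit string here is made up:

coeff_name_list = ["p_value", "recurrence", "connectivity", "cluster_size",
                   "gradient", "avg_degree", "unknown_ratio"]
bit_string = "1101100"
formula_terms = [name for bit, name in zip(bit_string, coeff_name_list) if bit == "1"]
print("is_correct~%s" % "+".join(formula_terms))
# prints: is_correct~p_value+recurrence+cluster_size+gradient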
def interpolazionelineare(x, y):
    rpy.set_default_mode(rpy.NO_CONVERSION)  # needed so the model can be handed back to R below
    linear_model = rpy.r.lm(rpy.r("y ~ x"), data=rpy.r.data_frame(x=x, y=y))
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    summary = rpy.r.summary(linear_model)
    # coefficient table rows: [0] = intercept, [1] = slope; columns: [0] = estimate, [1] = std. error
    risultati = (summary['coefficients'][0][0],
                 summary['coefficients'][0][1],
                 summary['coefficients'][1][0],
                 summary['coefficients'][1][1])
    return risultati
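A hypothetical call of interpolazionelineare() with made-up data, unpacking the tuple in the order the function actually returns it:

x = [1.0, 2.0, 3.0, 4.0, 5.0]
y = [2.1, 3.9, 6.2, 7.8, 10.1]
intercept, intercept_err, slope, slope_err = interpolazionelineare(x, y)
print("y = %.3f + %.3f * x" % (intercept, slope))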
def _setup(self):
    try:
        import rpy
        self.__rpy = rpy
        self.__rpy_version = 1
    except:
        import rpy2.rpy_classic as rpy
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        self.__rpy = rpy
        self.__rpy_version = 2
    self._process_events()
    globals()[self.__name] = rpy.r
def estimate_pi0(self, lambda_list, pi0_list):
    """
    01-19-06
        Storey2003, (natural) cubic spline, df=3
    """
    sys.stderr.write("Estimating pi0...\n")
    rpy.set_default_mode(rpy.NO_CONVERSION)
    s = r.smooth_spline(lambda_list, pi0_list, df=3)
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    estimated_pi0 = r.predict(s, 1)['y']
    print "\t estimated_pi0:", estimated_pi0
    sys.stderr.write("Done.\n")
    return estimated_pi0
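A rough, self-contained sketch of how the (lambda_list, pi0_list) inputs to estimate_pi0() are usually built in the Storey (2003) procedure; the p-values here are random placeholders:

import random

pvalues = [random.random() for _ in range(1000)]
m = float(len(pvalues))
lambda_list = [i / 20.0 for i in range(1, 19)]  # 0.05, 0.10, ..., 0.90
pi0_list = [sum(1 for p in pvalues if p > lam) / (m * (1.0 - lam)) for lam in lambda_list]
# estimate_pi0() then smooths pi0(lambda) with a cubic spline and evaluates it at lambda = 1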
def interpolazionelineare(self, other):
    """x.interpolazionelineare(y) performs the linear fit with x on the
    abscissa and y on the ordinate. x and y must both be objects of the
    DatiSperimentali class."""
    rpy.set_default_mode(rpy.NO_CONVERSION)
    linear_model = rpy.r.lm(rpy.r("y ~ x"),
                            data=rpy.r.data_frame(x=self.valori, y=other.valori))
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    summary = rpy.r.summary(linear_model)
    # coefficient table rows: [0] = intercept, [1] = slope; columns: [0] = estimate, [1] = std. error
    risultati = (summary['coefficients'][0][0],
                 summary['coefficients'][0][1],
                 summary['coefficients'][1][0],
                 summary['coefficients'][1][1])
    return risultati
def _train(self, dataset):
    """Train the classifier using `data` (`Dataset`).
    """
    # process the labels based on the model family
    if self.params.family == 'gaussian':
        # do nothing, just save the labels as a list
        labels = dataset.labels.tolist()
        pass
    elif self.params.family == 'multinomial':
        # turn labels into list of range values starting at 1
        labels = _label2indlist(dataset.labels, dataset.uniquelabels)
        self.__ulabels = dataset.uniquelabels.copy()

    # process the pmax
    if self.params.pmax is None:
        # set it to the num features
        pmax = dataset.nfeatures
    else:
        # use the value
        pmax = self.params.pmax

    # train with specifying max_steps
    # must not convert trained model to dict or we'll get segfault
    rpy.set_default_mode(rpy.NO_CONVERSION)
    self.__trained_model = rpy.r.glmnet(dataset.samples,
                                        labels,
                                        family=self.params.family,
                                        alpha=self.params.alpha,
                                        nlambda=self.params.nlambda,
                                        standardize=self.params.standardize,
                                        thresh=self.params.thresh,
                                        pmax=pmax,
                                        maxit=self.params.maxit,
                                        type=self.params.model_type)
    rpy.set_default_mode(rpy.NO_DEFAULT)

    # get a dict version of the model
    self.__trained_model_dict = rpy.r.as_list(self.__trained_model)

    # save the lambda of last step
    self.__last_lambda = self.__trained_model_dict['lambda'][-1]

    # set the weights to the last step
    weights = rpy.r.coef(self.__trained_model, s=self.__last_lambda)
    if self.params.family == 'multinomial':
        self.__weights = N.hstack([rpy.r.as_matrix(weights[str(i)])[1:]
                                   for i in range(1, len(self.__ulabels) + 1)])
    elif self.params.family == 'gaussian':
        self.__weights = rpy.r.as_matrix(weights)[1:]
def make_L(data,direction='S',z=None,): """ Define the along track distance from one reference direction define the cardinal direction priority (N,S,W or E). S means that the reference will be the southern most point z define the bathymetry, if defined, the closest point to that bathymetry will be the reference. In case of cross this bathymetry more than once, the direction criteria is used to distinguish. """ from fluid.common.distance import distance all_cycles_data = join_cycles(data) if z==None: import rpy #for t in topex.invert_keys(data): for t in all_cycles_data: rpy.set_default_mode(rpy.NO_CONVERSION) linear_model = rpy.r.lm(rpy.r("y ~ x"), data = rpy.r.data_frame(x=all_cycles_data[t]['Longitude'], y=all_cycles_data[t]['Latitude'])) rpy.set_default_mode(rpy.BASIC_CONVERSION) coef=rpy.r.coef(linear_model) if direction=='S': lat0=all_cycles_data[t]['Latitude'].min()-1 lon0 = (lat0-coef['(Intercept)'])/coef['x'] L_correction = distance(all_cycles_data[t]['Latitude'],all_cycles_data[t]['Longitude'],lat0,lon0).min() for c in invert_keys(data)[t]: data[c][t]['L'] = distance(data[c][t]['Latitude'],data[c][t]['Longitude'],lat0,lon0)- L_correction # This bathymetric method was only copied from an old code. This should be atleast # changed, if not removed. elif method=='bathymetric': import rpy for t in all_cycles_data: # First define the near coast values. idSouth=numpy.argmin(all_cycles_data[t]['Latitude']) L_tmp = distance(all_cycles_data[t]['Latitude'],all_cycles_data[t]['Longitude'],all_cycles_data[t]['Latitude'][idSouth],all_cycles_data[t]['Longitude'][idSouth]) idNearCoast = L_tmp.data<400e3 if min(all_cycles_data[t]['Bathy'][idNearCoast]) > -z: idNearCoast = L_tmp.data<600e3 # Then calculate the distance to a reference rpy.set_default_mode(rpy.NO_CONVERSION) linear_model = rpy.r.lm(rpy.r("y ~ x"), data = rpy.r.data_frame(x=all_cycles_data[t]['Longitude'], y=all_cycles_data[t]['Latitude'])) rpy.set_default_mode(rpy.BASIC_CONVERSION) coef=rpy.r.coef(linear_model) lat0 = all_cycles_data[t]['Latitude'].min()-1 lon0 = (lat0-coef['(Intercept)'])/coef['x'] #L = distance(,lon,lat0,lon0) # #id0 = numpy.argmin(numpy.absolute(all_cycles_data[t]['Bathy'][idNearCoast])) idref=numpy.argmin(numpy.absolute(all_cycles_data[t]['Bathy'][idNearCoast]+z)) #L_correction = distance(all_cycles_data[t]['Latitude'][idNearCoast][idref],all_cycles_data[t]['Longitude'][idNearCoast][idref],all_cycles_data[t]['Latitude'][idNearCoast][idref],all_cycles_data[t]['Longitude'][idNearCoast][idref]) L_correction = distance(all_cycles_data[t]['Latitude'][idNearCoast][idref],all_cycles_data[t]['Longitude'][idNearCoast][idref],lat0,lon0) for c in topex.invert_keys(data)[t]: #data[c][t]['L'] = distance(data[c][t]['Latitude'],data[c][t]['Longitude'],all_cycles_data[t]['Latitude'][idNearCoast][id0],all_cycles_data[t]['Longitude'][idNearCoast][id0]) - L_correction data[c][t]['L'] = distance(data[c][t]['Latitude'],data[c][t]['Longitude'],lat0,lon0) - L_correction # return
def pure_linear_model_via_R(cls, non_NA_genotype_ls, non_NA_phenotype_ls, non_NA_phenotype2count=None):
    """
    2010-2-25
        use createDesignMatrix() to generate a design matrix
    2009-8-28
        split out of pure_linear_model(). same functionality as pure_linear_model(),
        but invoke R to run regression.
    """
    genotype_matrix = cls.createDesignMatrix(non_NA_genotype_ls)
    no_of_rows = len(non_NA_phenotype_ls)  # number of samples, used in the variance calculation below
    # 2008-11-10 do linear regression by R
    genotype_var = numpy.var(genotype_matrix[:, 0])  # 2008-11-10 var=\sum(x_i-\bar{x})^2/(n-1)
    rpy.set_default_mode(rpy.NO_CONVERSION)  # 04-07-05
    # data_frame = rpy.r.as_data_frame({"phenotype":non_NA_phenotype_ls, "genotype":rpy.r.as_factor(genotype_matrix[:,1])})
    formula_list = []
    data_frame_dict = {"phenotype": non_NA_phenotype_ls}
    for i in range(genotype_matrix.shape[1]):
        var_name = "genotype%s" % i
        formula_list.append(var_name)
        data_frame_dict.update({var_name: genotype_matrix[:, i]})
    data_frame = rpy.r.as_data_frame(data_frame_dict)
    formula = "phenotype~%s" % "+".join(formula_list)
    if non_NA_phenotype2count and len(non_NA_phenotype2count) == 2:
        # binary phenotype, use logistic regression
        lm_result = rpy.r.glm(rpy.r(formula), data=data_frame, family=rpy.r("binomial"))
    else:
        lm_result = rpy.r.glm(rpy.r(formula), data=data_frame)
    rpy.set_default_mode(rpy.BASIC_CONVERSION)  # 04-07-05 r.summary() requires lm_result in NO_CONVERSION state
    summary_stat = rpy.r.summary(lm_result)

    # 06-30-05 index 0 in summary_stat['coefficients'] is intercept
    coeff_list = []
    coeff_p_value_list = []
    for i in range(len(summary_stat["coefficients"])):
        coeff_list.append(summary_stat["coefficients"][i][0])  # 0 is the coefficient
        coeff_p_value_list.append(summary_stat["coefficients"][i][-1])  # -1 is the corresponding p-value
    # 06-30-05 fill in other coefficients based on bit_string, NOTE i+1

    pvalue = coeff_p_value_list[1]
    residuals = summary_stat["deviance"]
    geno_effect_var = genotype_var * coeff_list[1] * coeff_list[1] * (no_of_rows - 1)
    var_perc = geno_effect_var / (residuals + geno_effect_var)

    pdata = PassingData(
        pvalue=pvalue, var_perc=var_perc, coeff_list=coeff_list, coeff_p_value_list=coeff_p_value_list
    )
    return pdata
def plot(self,filename=None,format='pdf',**kwargs): """Plot the heatmap and save to an image file. plot() # display using windowing system plot('hm') # --> hm.pdf plot('hm.png') # --> hm.png plot('hm','png') # --> hm.png By default a clustered heat map is constructed using R's heatmap.2 function. If R cannot be found, an unclustered heat map is plotted. **kwargs can be used to customize the output. :Arguments: filename name of the image file; may contain extension If empty use the windowing system. format eps,pdf,png... whatever matplotlib understands **kwargs for R: scale Determines the coloring. Choose between 'none' (the actual values in the heat map (possibly already normalized)), 'row' or 'column' (z-score across the dimension) N_colors Number of color levels; default is 32. **kwargs for matplotlib: The kwargs are applied to the matplotlib.text() method and are typically used to set font properties. See the pylab/matplotlib documentation. """ if filename: format = hop.utilities.fileextension(filename,default=format) labels = self.labels() try: try: import rpy except ImportError: from rpy2 import rpy_classic as rpy # http://www.mail-archive.com/[email protected]/msg01893.html rpy.set_default_mode(rpy.BASIC_CONVERSION) self._heatmap_R(labels,filename=filename,format=format,**kwargs) except ImportError: msg(0,"rpy package missing: cannot plot clustered heat map, defaulting to " "an unclustered heat map") self._heatmap_matplotlib(labels,filename=filename,format=format,**kwargs) if filename: msg(1,"Wrote image to file %s.\n" % self.filename(filename,ext=format))
def check_R(model, g):
    import rpy
    from rpy import r
    from numpy import array, allclose

    vars = [v.replace(':', '.').replace('+', 'p').replace('-', 'm').replace('_', '.')
            for v in model.vars[1:]]
    frame = dict((v, model.X[:, i + 1].reshape(-1)) for i, v in enumerate(vars))
    frame['y'] = model.y.reshape(-1)
    formula = 'y ~ ' + ' + '.join(v.replace(':', '.') for v in vars)

    rpy.set_default_mode(rpy.NO_CONVERSION)
    mod = r.glm(r(formula), data=r.data_frame(**frame), family=r.binomial('logit'))
    rpy.set_default_mode(rpy.BASIC_CONVERSION)

    pmod = mod.as_py()
    coef = r.coefficients(mod)
    coef = array([coef['(Intercept)']] + [coef[v] for v in vars], dtype=float)
    coef2 = g.beta.reshape(-1)
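A small standalone illustration of the identifier sanitizing check_R() performs so that Python-style term names become legal R formula terms (the variable names are invented):

terms = ['age', 'dose:time', 'x-offset', 'group_b']
r_safe = [t.replace(':', '.').replace('+', 'p').replace('-', 'm').replace('_', '.') for t in terms]
print(r_safe)                       # ['age', 'dose.time', 'xmoffset', 'group.b']
print('y ~ ' + ' + '.join(r_safe))  # y ~ age + dose.time + xmoffset + group.b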
def _predict(self, data):
    """
    Predict the output for the provided data.
    """
    # predict with standard method
    values = rpy.r.predict(self.__trained_model,
                           newx=data,
                           type='link',
                           s=self.__last_lambda)

    # predict with the final state (i.e., the last step)
    classes = None
    if self.params.family == 'multinomial':
        # remove last dimension of values
        values = values[:, :, 0]

        # get the classes too (they are 1-indexed)
        rpy.set_default_mode(rpy.NO_CONVERSION)
        class_ind = rpy.r.predict(self.__trained_model,
                                  newx=data,
                                  type='class',
                                  s=self.__last_lambda)
        rpy.set_default_mode(rpy.NO_DEFAULT)
        class_ind = rpy.r.as_vector(class_ind)

        # convert the strings to ints and subtract 1
        class_ind = N.array([int(float(c)) - 1 for c in class_ind])

        # convert to actual labels
        classes = self.__ulabels[class_ind]
    else:
        # is gaussian, so just remove last dim of values
        values = values[:, 0]

    # values need to be set anyways if values state is enabled
    self.values = values
    if classes is not None:
        # return the class predictions
        return classes
    else:
        # return the values as predictions
        return values
def _mcmc_betas_same_sources(self, tag_list): """ The given tag_list contains tags that all have the same features available. Train on the tags in tag_list using only the songs in self.only_these_songs, or all available songs if self.only_these_songs is None. """ if not self.production_run: self.mcmc_reps = 75 # save time rc.library("bayesm") data = [] for tag in tag_list: data.append(rc.list(X=self.X[tag],y=self.y[tag])) rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that lm returns Robj. data = rc.list(*data) if self.regtype in ["Hierarchical Linear", "Hierarchical Mixture"]: Data = rc.list(regdata=data) elif self.regtype=="Hierarchical Logistic": Data = rc.list(lgtdata=data) if self.regtype=="Hierarchical Mixture": Prior = rc.list(ncomp=self.ncomp) Mcmc=rc.list(R=self.mcmc_reps) rpy.set_default_mode(rpy.BASIC_CONVERSION) try: if self.regtype=="Hierarchical Linear": output = rc.rhierLinearModel(Data=Data,Mcmc=Mcmc) elif self.regtype=="Hierarchical Logistic": output = rc.rhierBinLogit(Data=Data,Mcmc=Mcmc) elif self.regtype=="Hierarchical Mixture": output = rc.rhierLinearMixture(Data=Data,Prior=Prior,Mcmc=Mcmc) except: #pdb.set_trace() self._info_about_r_error(tag_list) return beta_matrix = output['betadraw'].mean(axis=2) # nregressions x ncoeffs, averaged along third dim matrix_index = 0 for tag in tag_list: cur_tag_beta_vec = beta_matrix[matrix_index,:] beta_dict_list = [dict([('beta', coeff)]) for coeff in cur_tag_beta_vec] self.beta[tag] = dict(zip(self.sorted_sources[tag],beta_dict_list)) self.stats[tag] = dict() # I'm not currently storing any stats for hierarchical regressions. matrix_index += 1
def _independent_betas_same_sources(self, tag_list, remove_tags_when_bad_regression, n_times_show_summary=3): times_showed_summary = 0 # This allows us to print out some summary statistics without producing an overwhelming amount of output. SUMMARY_STATS = ["beta", "stderr", "tstat", "pval"] for tag in tag_list: self._progress("Computing betas for tag %s." % tag, newline=True) # rmme: newline make false rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that lm returns Robj. data = rc.list(y=self.y[tag],X=self.X[tag]) model = "y~X-1" # Use -1 because X has an intercept already if self.regtype=="Independent Linear": try: result = rc.lm(model,data=data) except: pdb.set_trace() elif self.regtype=="Independent Logistic": result = rc.glm(model,family=rc.binomial("logit"),data=data) rpy.set_default_mode(rpy.BASIC_CONVERSION) # Return to normal conversion mode. summary = rc.summary(result,correlation=rc.TRUE) self._record_regression_stats(tag, summary) beta_dict = dict() sorted_sources = self.sorted_sources[tag] coeff_matrix = summary["coefficients"] for i in range(len(sorted_sources)): try: cur_source_dict = dict(zip(SUMMARY_STATS,coeff_matrix[i,:])) except IndexError: util.info("\tWARNING: Regression for %s didn't end up using all variables." % tag) if remove_tags_when_bad_regression: self._remove_tag(tag) break # break from for-loop over sorted_sources; we don't continue out of the per-tag for loop until later when we check if tag is in self.features.... continue try: cur_source_dict["-log10(pval)"] = -log(cur_source_dict["pval"], 10) except OverflowError: pass beta_dict[sorted_sources[i]] = cur_source_dict if tag not in self.features: # We've removed this tag a few lines above, so skip it. continue self.beta[tag] = beta_dict if times_showed_summary < n_times_show_summary: self._print_regression_summary(tag, summary) times_showed_summary += 1
def rpart_fit_and_predict(self, all_data, known_data, rpart_cp, loss_matrix, prior_prob, bit_string='11111'): """ 11-09-05 1st use known_data to get the fit model 2nd use the fit model to do prediction on all_data, result is prob for each class 11-09-05 add rpart_cp 11-17-05 add loss_matrix, prior_prob return two pred """ sys.stderr.write("rpart fitting and predicting...\n") r.library("rpart") coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'gradient'] formula_list = [] for i in range(len(bit_string)): if bit_string[i] == '1': formula_list.append(coeff_name_list[i]) #11-17-05 transform into array all_data = array(all_data) known_data = array(known_data) set_default_mode(NO_CONVERSION) data_frame = r.as_data_frame({"p_value":known_data[:,0], "recurrence":known_data[:,1], "connectivity":known_data[:,2], \ "cluster_size":known_data[:,3], "gradient":known_data[:,4], "is_correct":known_data[:,-1]}) if prior_prob: prior_prob = [prior_prob, 1-prior_prob] #get the full list fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp),\ parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix) ) ) else: fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp),\ parms=r.list(loss=r.matrix(loss_matrix) ) ) set_default_mode(BASIC_CONVERSION) pred_training = r.predict(fit, data_frame, type=["class"]) del data_frame set_default_mode(NO_CONVERSION) all_data_frame = r.as_data_frame({"p_value":all_data[:,0], "recurrence":all_data[:,1], "connectivity":all_data[:,2], \ "cluster_size":all_data[:,3], "gradient":all_data[:,4], "is_correct":all_data[:,-1]}) set_default_mode(BASIC_CONVERSION) pred = r.predict(fit, all_data_frame, type=["class"]) #11-17-05 type=c("class") del all_data_frame sys.stderr.write("Done rpart fitting and predicting.\n") return pred, pred_training
# if you have rpy installed, use it to test the results
have_rpy = False
try:
    print("\n")
    print("=" * 30)
    print("Validating OLS results in R")
    print("=" * 30)
    import rpy
    have_rpy = True
except ImportError:
    print("\n")
    print("=" * 30)
    print("Validating OLS-class results in R")
    print("=" * 30)
    print("rpy is not installed")
    print("=" * 30)

if have_rpy:
    y = data[:, 0]
    x1 = data[:, 1]
    x2 = data[:, 2]
    x3 = data[:, 3]
    x4 = data[:, 4]

    rpy.set_default_mode(rpy.NO_CONVERSION)
    linear_model = rpy.r.lm(rpy.r("y ~ x1 + x2 + x3 + x4"),
                            data=rpy.r.data_frame(x1=x1, x2=x2, x3=x3, x4=x4, y=y))
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    print(linear_model.as_py()['coefficients'])
    summary = rpy.r.summary(linear_model)
    print(summary)
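The validation block above assumes a `data` array defined earlier in the script (column 0 is the response, columns 1-4 are the regressors). A hedged sketch of a synthetic stand-in, so the block can be exercised on its own:

import numpy as np

np.random.seed(0)
n = 100
X = np.random.randn(n, 4)
beta = np.array([1.0, -2.0, 0.5, 3.0])
y_col = X.dot(beta) + np.random.randn(n)
data = np.column_stack([y_col, X])  # column 0: y, columns 1-4: x1..x4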
def __init__(self, y, design, model_type=r.lm, **kwds): """ Set up and estimate R model with data and design """ r.library("MASS") # still needs to be in test, but also here for # logical tests at the end not to show an error self.y = np.array(y) self.design = np.array(design) self.model_type = model_type self._design_cols = ["x.%d" % (i + 1) for i in range(self.design.shape[1])] # Note the '-1' for no intercept - this is included in the design self.formula = r("y ~ %s-1" % "+".join(self._design_cols)) self.frame = r.data_frame(y=y, x=self.design) rpy.set_default_mode(rpy.NO_CONVERSION) results = self.model_type(self.formula, data=self.frame, **kwds) self.robj = results # keep the Robj model so it can be # used in the tests rpy.set_default_mode(rpy.BASIC_CONVERSION) rsum = r.summary(results) self.rsum = rsum # Provide compatible interface with scipy models self.results = results.as_py() # coeffs = self.results['coefficients'] # self.beta0 = np.array([coeffs[c] for c in self._design_cols]) self.nobs = len(self.results["residuals"]) if isinstance(self.results["residuals"], dict): self.resid = np.zeros((len(list(self.results["residuals"].keys())))) for i in list(self.results["residuals"].keys()): self.resid[int(i) - 1] = self.results["residuals"][i] else: self.resid = self.results["residuals"] self.fittedvalues = self.results["fitted.values"] self.df_resid = self.results["df.residual"] self.params = rsum["coefficients"][:, 0] self.bse = rsum["coefficients"][:, 1] self.bt = rsum["coefficients"][:, 2] try: self.pvalues = rsum["coefficients"][:, 3] except: pass self.rsquared = rsum.setdefault("r.squared", None) self.rsquared_adj = rsum.setdefault("adj.r.squared", None) self.aic_R = rsum.setdefault("aic", None) self.fvalue = rsum.setdefault("fstatistic", None) if self.fvalue and isinstance(self.fvalue, dict): self.fvalue = self.fvalue.setdefault("value", None) # for wls df = rsum.setdefault("df", None) if df: # for RLM, works for other models? self.df_model = df[0] - 1 # R counts intercept self.df_resid = df[1] self.bcov_unscaled = rsum.setdefault("cov.unscaled", None) self.bcov = rsum.setdefault("cov.scaled", None) if "sigma" in rsum: self.scale = rsum["sigma"] elif "dispersion" in rsum: self.scale = rsum["dispersion"] else: self.scale = None self.llf = r.logLik(results) if model_type == r.glm: self.getglm() if model_type == r.rlm: self.getrlm()
from rpy import r, set_default_mode, NO_CONVERSION, PROC_CONVERSION
import numpy as N
from scipy.interpolate import interp1d
from scipy.ndimage import gaussian_filter1d
from ppgplot_spb import *
import gaussian

set_default_mode(PROC_CONVERSION)


def bootdensity(data, min, max, nboot, ci):
    """
    Calculate density and confidence intervals on density
    for a 1D array of points.  Bandwidth is selected automatically.
    """
    r("""
      limdensity <- function(data, weights=NULL, bw="nrd0")
      {
          density(data, from=%f, to=%f, weights=weights, bw=bw)
      }
      """ % (min, max))
    density = r.limdensity(data)
    xdens = N.array(density['x'])
    ydens = N.array(density['y'])
    bw = density['bw']
    #print 'bandwidth:', bw
    ydensboot = N.zeros((nboot, len(xdens)), N.float)
    ndata = len(data)
    ran = N.random.uniform(0, ndata, (nboot, ndata)).astype(N.int)
    for i in range(nboot):
        den = r.limdensity(data[ran[i]])
def DecisionTree(output_dir, elev_filename, landcover_filename, river_filename): """ This module generate decision tree used to allocate landcover classes. It imports rpart library from rpy package. Reads the training data, creates a sample data and use rpart libray to build decision tree. """ rpy.r.library("rpart") # rpart library used for creating Decision tree # Read Elevation Data from ascii file file_name = "training_data/%s" % (elev_filename) Elev_arr = numpy.loadtxt(file_name, unpack=True) # Read Landcover Data from ascii file file_name = "training_data/%s" % (landcover_filename) Landcover = numpy.loadtxt(file_name, unpack=True) # Read River Data from ascii file file_name = "training_data/%s" % (river_filename) River = numpy.loadtxt(file_name, unpack=True) # Compute City block distance from River data River_dist_arr = city_block_dist.CityBlock(River) # Compute Slope and Aspect from Elevation data (Slope_arr, Aspect_arr) = Slope_aspect.Slope_aspect(Elev_arr) (x_len, y_len) = Elev_arr.shape no_of_veg_class = 10 # no of vegetation class in Landcover matrix # Generating Lists for differnt Landcover classes # Create list of lists to hold pixels of each landcover class - no of list in # list L is equal to no_of_veg_class L = [] for i in range(0, no_of_veg_class): L.append([]) # Now append the pixel co-ordinates into respective list of lists for i in range(1, x_len - 1): # Ignoring boundary cells for j in range(1, y_len - 1): # because we don't have slope and aspect for them # nodata values already gets handled since we are ignoring it if Landcover[i][j] == 0: L[0].append((i, j)) elif Landcover[i][j] == 1: L[1].append((i, j)) elif Landcover[i][j] == 2: L[2].append((i, j)) elif Landcover[i][j] == 3: L[3].append((i, j)) elif Landcover[i][j] == 4: L[4].append((i, j)) elif Landcover[i][j] == 5: L[5].append((i, j)) elif Landcover[i][j] == 6: L[6].append((i, j)) elif Landcover[i][j] == 7: L[7].append((i, j)) elif Landcover[i][j] == 8: L[8].append((i, j)) elif Landcover[i][j] == 9: L[9].append((i, j)) # Sample Data for decision tree # normalizing elevation data minimum_elev = numpy.min(Elev_arr) factor = numpy.max(Elev_arr) - minimum_elev Elev_arr = (Elev_arr[:, :] - minimum_elev) * 100 / factor # Create various list to hold sample training data Elevation = [] Slope = [] RiverDistance = [] Aspect_x = [] Aspect_y = [] Class = [] # Now sampling the data for i in range(0, no_of_veg_class): if len(L[i]) < 500: limit = len(L[i]) else: limit = 500 for j in range(0, limit): Elevation.append(int(Elev_arr[L[i][j][0]][L[i][j][1]])) Slope.append(int(Slope_arr[L[i][j][0]][L[i][j][1]])) RiverDistance.append(int(River_dist_arr[L[i][j][0]][L[i][j][1]])) Aspect_x.append(int(Aspect_arr[L[i][j][0]][L[i][j][1]][0])) Aspect_y.append(int(Aspect_arr[L[i][j][0]][L[i][j][1]][1])) Class.append(i) # create dictionary of sample data which will be needed to generate decision tree traing_data = { "Elevation": Elevation, "Slope": Slope, "RiverDistance": RiverDistance, "Aspect_x": Aspect_x, "Aspect_y": Aspect_y, "Class": Class, } # write dictionary into pickle file for further use(reusability) output = open("decision_tree.pkl", "wb") pickle.dump(traing_data, output) output.close() rpy.set_default_mode(rpy.NO_CONVERSION) print "Creating Decision tree" # Using rpart create the decision tree fit = rpy.r.rpart( formula="Class ~ Elevation + RiverDistance + Slope + Aspect_x + Aspect_y", data=traing_data, method="class" ) # output a png image of the decision tree file_name = "%s/DecisionTree.png" % (output_dir) rpy.r.png(file_name) 
    rpy.r.plot(fit)
    rpy.r.text(fit)
    rpy.r.dev_off()
def calc_stratified_rates(summset, popset, conflev=0.95, basepop=100000, timeinterval='years', ci_method='dobson', popset_popcol='_freq_', debug=False): """ Calculate stratified population rates summset is a straified summary dataset of counts of events for the population-of-interest popset is the stratified population counts for the population-of-interest """ from rpy import r, get_default_mode, set_default_mode, BASIC_CONVERSION alpha = get_alpha(conflev) if ci_method not in ('dobson', 'ff'): raise Error('Only Dobson et al. (dobson) and Fay-Feuer (ff) ' 'methods for confidence intervals currently ' 'implemented') if not popset.has_column(popset_popcol): raise Error('Denominator population dataset %r does not have a ' '%r column' % (popset.label or popset.name, popset_popcol)) st = time.time() r_mode = get_default_mode() try: set_default_mode(BASIC_CONVERSION) # We turn the summset into an Ncondcols-dimensional matrix summtab = CrossTab.from_summset(summset) # The population dataset must have at least as many dimensions as # summary dataset. Any additional axes are eliminated by summing. # any missing axes are created by replication. poptab = CrossTab.from_summset(popset, shaped_like=summtab) poptab.collapse_axes_not_in(summtab) poptab.replicate_axes(summtab) popfreq = poptab[popset_popcol].data.astype(Numeric.Float64) # Manufacture a CrossTab for the result result = summtab.empty_copy() basepop = float(basepop) for table, name, n_add, l_add in just_freq_tables(summtab): # avoid integer overflows... summfreq = table.data.astype(Numeric.Float64) strata_rate = summfreq / popfreq result.add_table('summfreq' + n_add, data=summfreq, label='Events' + l_add) result.add_table('popfreq' + n_add, data=popfreq, label='Person-' + timeinterval + ' at risk' + l_add) result.add_table('sr' + n_add, data=strata_rate * basepop, label='Strata-specific Rate per ' + '%d' % basepop + ' person-' + timeinterval + l_add) if alpha is not None: # CIs for stratified rates summfreq_shape = summfreq.shape summfreq_flat = MA.ravel(summfreq) assert popfreq.shape == summfreq.shape popfreq_flat = MA.ravel(popfreq) sr_ll = Numeric.empty(len(summfreq_flat), typecode=Numeric.Float64) sr_ul = Numeric.empty(len(summfreq_flat), typecode=Numeric.Float64) sr_ll_mask = Numeric.zeros(len(summfreq_flat), typecode=Numeric.Int8) sr_ul_mask = Numeric.zeros(len(summfreq_flat), typecode=Numeric.Int8) for i, v in enumerate(summfreq_flat): try: if v == 0: sr_ll[i] = 0.0 else: sr_ll[i] = ( (r.qchisq(alpha / 2., df=2.0 * v) / 2.0) / popfreq_flat[i]) * basepop sr_ul[i] = ( (r.qchisq(1. - alpha / 2., df=2.0 * (v + 1)) / 2.0) / popfreq_flat[i]) * basepop except: sr_ll[i] = 0.0 sr_ul[i] = 0.0 sr_ll_mask[i] = 1 sr_ul_mask[i] = 1 sr_ll = MA.array(sr_ll, mask=sr_ll_mask, typecode=MA.Float64) sr_ul = MA.array(sr_ul, mask=sr_ul_mask, typecode=MA.Float64) sr_ll.shape = summfreq_shape sr_ul.shape = summfreq_shape sr_base = 'Stratified rate %s%%' % (100.0 * conflev) result.add_table('sr_ll' + n_add, data=sr_ll, label=sr_base + ' lower confidence limit ' + l_add) result.add_table('sr_ul' + n_add, data=sr_ul, label=sr_base + ' upper confidence limit ' + l_add) finally: set_default_mode(r_mode) soom.info('calc_stratified_rates took %.03f' % (time.time() - st)) name = 'stratified_rates_' + summset.name label = 'Stratified Rates for ' + (summset.label or summset.name) if conflev: label += ' (%g%% conf. limits)' % (conflev * 100) if debug: global vars vars = Vars(locals()) return result.to_summset(name, label=label)
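A minimal standalone restatement of the per-stratum confidence-limit formula applied above (Dobson-style exact limits via chi-square quantiles); it assumes rpy and R are available, and the counts are illustrative:

import rpy
from rpy import r

rpy.set_default_mode(rpy.BASIC_CONVERSION)
events, person_years, basepop, alpha = 23.0, 18560.0, 100000.0, 0.05
rate = events / person_years * basepop
rate_ll = (r.qchisq(alpha / 2.0, df=2.0 * events) / 2.0) / person_years * basepop
rate_ul = (r.qchisq(1.0 - alpha / 2.0, df=2.0 * (events + 1.0)) / 2.0) / person_years * basepop
print((rate, rate_ll, rate_ul))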
def gls_via_R(cls, non_NA_genotype_ls, non_NA_phenotype_ls, non_NA_phenotype2count=None, variance_matrix=None): """ 2009-12-23 general least square model via calling equivalent function in R. """ genotype_matrix = cls.createDesignMatrix(non_NA_genotype_ls) # no need to add a constant vector. if hasattr(cls, "corStruct"): corStruct = cls.corStruct else: if variance_matrix is not None: corStruct = cls.generateCorStructForGLSFromVarianceMatrix(variance_matrix) setattr(cls, "corStruct", corStruct) else: corStruct = None # 2008-11-10 do linear regression by R genotype_var = numpy.var(genotype_matrix[:, 0]) # 2008-11-10 var=\sum(x_i-\bar{x})^2/(n-1) rpy.set_default_mode(rpy.NO_CONVERSION) # 04-07-05 rpy.r.library("nlme") # data_frame = rpy.r.as_data_frame({"phenotype":non_NA_phenotype_ls, "genotype":rpy.r.as_factor(genotype_matrix[:,1])}) formula_list = [] data_frame_dict = {"phenotype": non_NA_phenotype_ls} for i in range(genotype_matrix.shape[1]): var_name = "genotype%s" % i formula_list.append(var_name) data_frame_dict.update({var_name: genotype_matrix[:, i]}) data_frame = rpy.r.as_data_frame(data_frame_dict) formula = "phenotype~%s" % "+".join(formula_list) lm_result = rpy.r.gls(rpy.r(formula), data=data_frame, correlation=corStruct) rpy.set_default_mode(rpy.BASIC_CONVERSION) # 04-07-05 r.summary() requires lm_result in NO_CONVERSION state summary_stat = rpy.r.summary(lm_result) rpy.set_default_mode(rpy.NO_CONVERSION) summary_stat1 = rpy.r.summary(lm_result) rpy.set_default_mode(rpy.VECTOR_CONVERSION) summary_stat2 = rpy.r.summary(lm_result) rpy.set_default_mode(rpy.TOP_CONVERSION) summary_stat3 = rpy.r.summary(lm_result) # 06-30-05 index 0 in summary_stat['coefficients'] is intercept coeff_list = [] coeff_p_value_list = [] for i in range(len(summary_stat["coefficients"])): coeff_list.append(summary_stat["coefficients"][i][0]) # 0 is the coefficient coeff_p_value_list.append(summary_stat["coefficients"][i][-1]) # -1 is the corresponding p-value # 06-30-05 fill in other efficients based on bit_string, NOTE i+1 pvalue = coeff_p_value_list[1] residuals = summary_stat["deviance"] geno_effect_var = genotype_var * coeff_list[1] * coeff_list[1] * (no_of_rows - 1) var_perc = geno_effect_var / (residuals + geno_effect_var) pdata = PassingData( pvalue=pvalue, var_perc=var_perc, coeff_list=coeff_list, coeff_p_value_list=coeff_p_value_list ) return pdata
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: data2phylocontrasts.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"]) parser.add_option("-c", "--columns", dest="columns", type="string", help="columns to take for calculating histograms.") parser.add_option("-t", "--filename-tree", dest="filename_tree", type="string", help="filename with tree(s).") parser.add_option("--skip-header", dest="add_header", action="store_false", help="do not add header to flat format.") parser.add_option("--write-header", dest="write_header", action="store_true", help="write header and exit.") parser.add_option("--debug", dest="debug", action="store_true", help="debug mode") parser.add_option("--display-tree", dest="display_tree", action="store_true", help="display the tree") parser.add_option("-m", "--method", dest="methods", type="choice", action="append", choices=("contrasts", "spearman", "pearson", "compute"), help="methods to perform on contrasts.") parser.set_defaults( columns="all", filename_tree=None, add_header=True, write_header=False, debug=False, methods=[], value_format="%6.4f", pvalue_format="%e", display_tree=False, ) (options, args) = E.Start(parser, quiet=True) if options.columns not in ("all", "all-but-first"): options.columns = map(lambda x: int(x) - 1, options.columns.split(",")) phylip = WrapperPhylip.Phylip() if options.debug: phylip.setLogLevel(options.loglevel) phylip.setProgram("contrast") ########################################################## ########################################################## ########################################################## # retrieve data and give to phylip data = [] headers = [] first = True for line in sys.stdin: if line[0] == "#": continue d = line[:-1].strip().split("\t") if first: first = False headers = d[1:] continue data.append(d) phylip.setData(data) ncolumns = len(headers) nrows = len(data) ########################################################## ########################################################## ########################################################## # read trees nexus = None if options.filename_tree: nexus = TreeTools.Newick2Nexus(open(options.filename_tree, "r")) if not nexus: raise ValueError("please provide trees with branchlenghts") ########################################################## ########################################################## ########################################################## # set up phylip phylip_options = [] # print out contrasts phylip_options.append("C") phylip_options.append("Y") phylip.setOptions(phylip_options) ########################################################## ########################################################## ########################################################## # main loop ########################################################## for tree in nexus.trees: if options.display_tree: tree.display() # compute this before giving the tree to the phylip module, # as it remaps taxon names. 
map_node2data = {} for x in range(nrows): taxon = data[x][0] map_node2data[tree.search_taxon(taxon)] = x phylip.setTree(tree) result = phylip.run() for method in options.methods: if method in ("pearson", "spearman"): options.stdout.write("header1\theader2\tr\tp\tcode\n") n = len(result.mContrasts) columns = [] for c in range(ncolumns): columns.append(map(lambda x: x[c], result.mContrasts)) for x in range(0, ncolumns - 1): for y in range(x + 1, ncolumns): # phylip value phy_r = result.mCorrelations[x][y] import rpy from rpy import r as R # Various ways to calculate r. It is not possible to use # cor.test or lsfit directly, as you have to perform a # regression through the origin. # uncomment to check pearson r against phylip's value ## r = calculateCorrelationCoefficient( columns[x], columns[y] ) # for significance, use linear regression models in R rpy.set_default_mode(rpy.NO_CONVERSION) linear_model = R.lm(R("y ~ x - 1"), data=R.data_frame(x=columns[x], y=columns[y])) rpy.set_default_mode(rpy.BASIC_CONVERSION) ss = R.summary(linear_model) # extract the p-value p = ss['coefficients'][-1][-1] if p < 0.001: code = "***" elif p < 0.01: code = "**" elif p < 0.05: code = "*" else: code = "" options.stdout.write("\t".join( (headers[x], headers[y], options.value_format % phy_r, options.pvalue_format % p, code)) + "\n") elif method == "contrasts": options.stdout.write("\t".join(headers) + "\n") for d in result.mContrasts: options.stdout.write( "\t".join(map(lambda x: options.value_format % x, d)) + "\n ") elif method == "compute": # make room for all internal nodes and one dummy node # for unrooted trees. max_index = TreeTools.GetMaxIndex(tree) + 2 variances = [None] * max_index values = [[None] * nrows for x in range(max_index)] contrasts = [] for x in range(max_index): contrasts.append([None] * ncolumns) branchlengths = [None] * max_index def update_data( node_id, bl, c1, c2, ): b1, b2 = branchlengths[c1], branchlengths[c2] rb1 = 1.0 / b1 rb2 = 1.0 / b2 # compute variance variance = math.sqrt(b1 + b2) # extend branch length of this node to create correct # variance for parent branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2) variances[node_id] = variance for c in range(ncolumns): v1, v2 = values[c1][c], values[c2][c] # save ancestral value as weighted mean values[node_id][c] = ( (rb1 * v1 + rb2 * v2)) / (rb1 + rb2) # compute normalized contrast contrasts[node_id][c] = (v1 - v2) / variance def update_contrasts(node_id): """update contrasts for a node.""" node = tree.node(node_id) if node.succ: if len(node.succ) == 2: c1, c2 = node.succ update_data(node_id, node.data.branchlength, c1, c2) else: assert (node_id == tree.root) assert (len(node.succ) == 3) update_data(node_id, node.data.branchlength, node.succ[0], node.succ[1]) update_data(max_index - 1, node.data.branchlength, node_id, node.succ[2]) else: for c in range(ncolumns): values[node_id][c] = float( data[map_node2data[node_id]][c + 1]) branchlengths[node_id] = node.data.branchlength tree.dfs(tree.root, post_function=update_contrasts) options.stdout.write("node_id\tvariance\t%s\n" % "\t".join(headers)) for node_id in range(max_index): if variances[node_id] is None: continue options.stdout.write("%s\t%s\t%s\n" % ( node_id, options.value_format % variances[node_id], "\t".join( map(lambda x: options.value_format % x, contrasts[node_id])), )) E.Stop()
def calc_indirectly_std_ratios(summset, popset, stdsummset, stdpopset, conflev=0.95, baseratio=100, timeinterval='years', popset_popcol='_freq_', stdpopset_popcol='_stdpop_', ci_method='daly', debug=False): """ Calculate Indirectly Standardised Population Event Ratios - summset is a summary dataset of counts of events for the population-of-interest being compared to the standard population. - popset is the stratified population counts for the population-of-interest - stdsummset is a summary dataset of counts of events for the standard population - stdpopset is the stratified population counts for the standard population """ from rpy import r, get_default_mode, set_default_mode, BASIC_CONVERSION alpha = get_alpha(conflev) if ci_method != 'daly': raise Error("Only Daly method for confidence intervals " "currently implemented") if not popset.has_column(popset_popcol): raise Error('Denominator population dataset %r does not have a ' '%r column' % (popset.label or popset.name, popset_popcol)) if not stdpopset.has_column(stdpopset_popcol): raise Error('Standard population dataset %r does not have a ' '%r column' % (stdpopset.label or stdpopset.name, stdpopset_popcol)) st = time.time() r_mode = get_default_mode() try: set_default_mode(BASIC_CONVERSION) shape = shape_union(stdsummset, summset) summtab = CrossTab.from_summset(summset, shaped_like=shape) stdsummtab = CrossTab.from_summset(stdsummset, shaped_like=shape) stdpoptab = CrossTab.from_summset(stdpopset, shaped_like=shape) stdpoptab.collapse_axes_not_in(stdsummtab) stdsummtab.replicate_axes(shape) stdpoptab.replicate_axes(shape) poptab = CrossTab.from_summset(popset, shaped_like=shape) poptab.collapse_axes_not_in(shape) if poptab.get_shape() != stdsummtab.get_shape(): raise Error( 'Observed population does not have all the required columns') popfreq = poptab[popset_popcol].data.astype(MA.Float64) result = stdsummtab.empty_copy() result.add_table('popfreq', data=popfreq, label='Total person-' + timeinterval + ' at risk') expected_cols = [] for table, name, n_add, l_add in just_freq_tables(stdsummtab): stdsummfreq = stdsummtab[name].data.astype(MA.Float64) stdpopfreq = stdpoptab[stdpopset_popcol].data.astype(MA.Float64) std_strata_rates = stdsummfreq / stdpopfreq strata_expected_freq = std_strata_rates * popfreq # print stdsummfreq[0,0,0], stdpopfreq[0,0,0], popfreq[0,0,0] result.add_table('expected' + n_add, data=strata_expected_freq, label='Expected events' + l_add) expected_cols.append('expected' + n_add) result.collapse_axes_not_in(summtab) axis = 0 baseratio = float(baseratio) for table, name, n_add, l_add in just_freq_tables(summtab): observed = table.data.astype(Numeric.Float64) result.add_table('observed' + n_add, data=observed, label='Observed events' + l_add) expected = result['expected' + n_add].data isr = observed / expected result.add_table('isr' + n_add, data=isr * baseratio, label='Indirectly Standardised Event Ratio') # Confidence Intervals if alpha is None or name != '_freq_': # Can only calculate confidence intervals on freq cols continue conflev_l = (1 - conflev) / 2.0 conflev_u = (1 + conflev) / 2.0 # get shape of observed observed_shape = observed.shape # flattened version observed_flat = MA.ravel(observed) # sanity check on shapes - should be the same! 
assert expected.shape == observed.shape # flattened version of expecetd expected_flat = MA.ravel(expected) # lists to hold results isr_ll = Numeric.empty(len(observed_flat), typecode=Numeric.Float64) isr_ul = Numeric.empty(len(observed_flat), typecode=Numeric.Float64) isr_ll_mask = Numeric.zeros(len(observed_flat), typecode=Numeric.Int8) isr_ul_mask = Numeric.zeros(len(observed_flat), typecode=Numeric.Int8) obs_mask = MA.getmaskarray(observed_flat) exp_mask = MA.getmaskarray(expected_flat) for i, v in enumerate(observed_flat): if obs_mask[i] or exp_mask[i]: isr_ll[i] = 0.0 isr_ul[i] = 0.0 isr_ll_mask[i] = 1 isr_ul_mask[i] = 1 else: if v == 0.: obs_ll = 0.0 obs_ul = -math.log(1 - conflev) else: obs_ll = r.qgamma(conflev_l, v, scale=1.) obs_ul = r.qgamma(conflev_u, v + 1., scale=1.) isr_ll[i] = obs_ll / expected_flat[i] isr_ul[i] = obs_ul / expected_flat[i] isr_ll = MA.array(isr_ll, typecode=MA.Float64, mask=isr_ll_mask) isr_ul = MA.array(isr_ul, typecode=MA.Float64, mask=isr_ul_mask) isr_ll.shape = observed_shape isr_ul.shape = observed_shape isr_base = 'ISR %d%%' % (100.0 * conflev) result.add_table('isr_ll' + n_add, data=isr_ll * baseratio, label=isr_base + ' lower confidence limit' + l_add) result.add_table('isr_ul' + n_add, data=isr_ul * baseratio, label=isr_base + ' upper confidence limit' + l_add) finally: set_default_mode(r_mode) soom.info('calc_indirectly_std_ratios took %.03f' % (time.time() - st)) name = 'indir_std_ratios_' + summset.name label = 'Indirectly Standardised Ratios for ' + (summset.label or summset.name) if conflev: label += ' (%g%% conf. limits)' % (conflev * 100) if debug: global vars vars = Vars(locals()) return result.to_summset(name, label=label)
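Similarly, a standalone sketch of the Daly-style limits computed above for an indirectly standardised ratio: gamma quantiles of the observed count divided by the expected count (counts are illustrative; assumes rpy and R are installed):

import rpy
from rpy import r

rpy.set_default_mode(rpy.BASIC_CONVERSION)
observed, expected, conflev = 31.0, 24.2, 0.95
conflev_l = (1 - conflev) / 2.0
conflev_u = (1 + conflev) / 2.0
isr = observed / expected
isr_ll = r.qgamma(conflev_l, observed, scale=1.0) / expected
isr_ul = r.qgamma(conflev_u, observed + 1.0, scale=1.0) / expected
print((isr, isr_ll, isr_ul))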
def main(parameter_file): """ It performs the following actions: 1. Gets the parameters, required for simulation, from parameter.yaml file. 2. calls DEM_creator() --> for generating DEM grid 3. Erosion modelling 4. Flow modelling 5. Landcover class allocation using decision tree 6. Geometric feature development 7. road mapping """ time1 = time.time() #*****************parameter handling ************************************* # Get the parameters from parameter.yaml file yaml_file = open(parameter_file, 'r') stream = yaml.load(yaml_file) resolution = stream['resolution'] H = stream['H'] H_wt = stream['H_wt'] seed = stream['seed'] sigma = stream['sigma'] elev_range = stream['elev_range'] max_level = stream['max_level'] DEMcreator_option = stream['DEMcreator_option'] output_dir = stream['output_dir'] river_drop = stream['river_drop'] Erosion_permission = stream['Erosion_permission'] decision_tree = stream['decision_tree'] counter = stream['counter'] elev_filename = stream['training_elev_filename'] landcover_filename = stream['training_landcover_filename'] river_filename = stream['training_river_filename'] no_of_veg_class = stream['no_of_veg_class'] min_area = stream['min_area'] max_area = stream['max_area'] aspect_ratio = stream['aspect_ratio'] agri_area_limit = stream['agri_area_limit'] yaml_file.close() #**************************print statistics*********************************** print ("Running simulation with follwing parameters") print ("H: %s" % H) print ("H_wt: %s" % H_wt) print ("seed: %s" % seed) print ("sigma: %f" % sigma) print ("elev_range: %s" % elev_range) print ("max_level: %s" % max_level) print ("DEMcreator_option: %s" % DEMcreator_option) print ("output_dir: %s" % output_dir) print ("River drop: %d" % river_drop) print ("counter: %d" % counter) print ("no of vegetation class %d" % no_of_veg_class) print ("min area: %f" % min_area) print ("max area: %f" % max_area) print ("aspect ratio: %f" % aspect_ratio) print ("agricultural area limit: %f" % agri_area_limit) gradient = 0 #fixed for now TODO incorporate gradient in next version #*****************************DEM genaration************************************ # Generate DEM using FM2D/SS algorithm by calling DEM_creator(args...) 
function DEM_Result = DEM_generator.DEM_creator(H, H_wt, seed, elev_range,sigma,gradient,max_level, DEMcreator_option) pathname = os.path.dirname(sys.argv[0]) fullpath = os.path.abspath(pathname) filename = fullpath + "/" + output_dir if not os.path.exists(filename): os.makedirs(filename) # create output directory if it doesn't exist DEM_arr = DEM_Result[0] DEM_Result = 0 #free space #****************************region adjustment*********************************** # We create a temporary region that is only valid in this python session g.use_temp_region() rows = DEM_arr.shape[0] cols = DEM_arr.shape[1] n = 4928050 #some arbitrary value s = n - resolution*rows e = 609000 #some arbitrary value w = e - resolution*cols g.run_command('g.region', flags = 'ap', n = n ,s = s, e = e, w = w,res = resolution, rows = rows ,cols = cols) #*************************Flow accumulation with Erosion modelling**************************** filename = fullpath + "/ascii_files" if not os.path.exists(filename): os.makedirs(filename) if not Erosion_permission: counter = 0 DEM_arr_to_ascii(DEM_arr,resolution) g.run_command('r.in.ascii', overwrite = True, flags='i', input = fullpath +'/'+'ascii_files' +'/DEM.asc', output='test_DEM') #Flow computation for massive grids (float version) g.run_command('r.terraflow', overwrite = True, elevation = 'test_DEM@user1', filled = 'flooded_DEM',\ direction = 'DEM_flow_direction',swatershed = 'DEM_sink_watershed', accumulation = 'DEM_flow_accum', tci = 'DEM_tci') g.run_command('r.out.ascii',flags='h',input='DEM_flow_accum@user1',output=fullpath +'/ascii_files'+ '/DEM_flow_accum',null='0') f = open(fullpath +'/ascii_files'+ '/DEM_flow_accum', 'r') Flow_accum_arr = numpy.loadtxt(f) f.close() for iteration in range(0,counter): DEM_arr_to_ascii(DEM_arr,resolution) #Input the DEM ascii file into grass g.run_command('r.in.ascii', overwrite = True, flags='i', input = fullpath +'/'+'ascii_files' +'/DEM.asc', output='test_DEM') #Flow computation for massive grids (float version) g.run_command('r.terraflow', overwrite = True, elevation = 'test_DEM@user1', filled = 'flooded_DEM',\ direction = 'DEM_flow_direction',swatershed = 'DEM_sink_watershed', accumulation = 'DEM_flow_accum', tci = 'DEM_tci') g.run_command('r.out.ascii',flags='h',input='DEM_flow_accum@user1',output=fullpath +'/ascii_files'+ '/DEM_flow_accum',null='0') f = open(fullpath +'/ascii_files'+ '/DEM_flow_accum', 'r') Flow_accum_arr = numpy.loadtxt(f) f.close() #call erosion modelling function DEM_arr = Erosion(Flow_accum_arr, DEM_arr, river_drop) output=fullpath +'/'+output_dir+ '/DEM.asc' arr_to_ascii(DEM_arr,output) output=fullpath +'/'+output_dir+ '/flow_accum.asc' arr_to_ascii(Flow_accum_arr,output) #****************************landcover allocation using decision tree******************************** # Get slope and Aspect using grass functions g.run_command('r.slope.aspect',overwrite=True,elevation='test_DEM@user1',slope='DEM_Slope',aspect='DEM_Aspect') g.run_command('r.out.ascii',flags='h',input='DEM_Slope@user1',output=fullpath + '/ascii_files'+'/DEM_Slope',null='0') f = open('ascii_files/DEM_Slope', 'r') DEM_Slope_arr = numpy.loadtxt(f) f.close() g.run_command('r.out.ascii',flags='h',input='DEM_Aspect@user1',output=fullpath +'/ascii_files'+'/DEM_Aspect',null='0') f = open('ascii_files/DEM_Aspect', 'r') DEM_Aspect_arr = numpy.loadtxt(f) f.close() Distance_arr = dist.CityBlock(Flow_accum_arr,flag = 0) # Normalize the elevation values to use decision tree minimum_elev = numpy.min(DEM_arr) factor = numpy.max(DEM_arr) - 
minimum_elev Elev_arr = (DEM_arr[:,:] - minimum_elev)*100/factor # Create various lists to hold test data Elevation = [] Slope = [] RiverDistance = [] Aspect = [] # Append the data into respective list x_len = DEM_arr.shape[0] y_len = DEM_arr.shape[1] for i in range(0,x_len): for j in range(0,y_len): Elevation.append(int(Elev_arr[i][j])) Slope.append(int(DEM_Slope_arr[i][j])) RiverDistance.append(int(Distance_arr[i][j])) Aspect.append(int(DEM_Aspect_arr[i][j])) Elev_arr = 0 #free space DEM_Slope_arr = 0 #free space DEM_Aspect_arr = 0 #free space Distance_arr = 0 #free space # Create dictionary to apply R's predict command on it Test_data = {'Elevation':Elevation ,'Slope':Slope ,'RiverDistance':RiverDistance,'Aspect':Aspect} #free space Elevation = [] Slope = [] RiverDistance = [] Aspect = [] # create decision tree from training data fit = DecisionTree(no_of_veg_class,elev_filename, landcover_filename, river_filename,decision_tree) g.run_command('g.region', flags = 'ap', n = n ,s = s, e = e, w = w,res = resolution, rows = rows ,cols = cols) # Allocate vegetation array for holding predicted landcover values Veg_arr = numpy.zeros(DEM_arr.shape, dtype = "uint8") rpy.r.library("rpart") rpy.set_default_mode(rpy.BASIC_CONVERSION) # values contains probability values of the predicted landcover classes values = rpy.r.predict(fit,newdata=Test_data,method="class") Test_data = 0 #free space x_len = Veg_arr.shape[0] y_len = Veg_arr.shape[1] for i in range(0,x_len): for j in range(0,y_len): # Get the class having max probability for each test data point a = ndimage.maximum_position(values[i*y_len + j]) Veg_arr[i,j] = (a[0]) # Assign them some value to facilitate visualization values = 0 #free space filename=fullpath +'/'+output_dir+ "/landcover.asc" arr_to_ascii(Veg_arr,filename) # Allocate and initialize Suitability map Suitability = numpy.zeros( DEM_arr.shape, dtype = "uint8") for i in range(0,DEM_arr.shape[0]): for j in range(0,DEM_arr.shape[1]): #TODO can use mask here, needs to be generalised if Veg_arr[i][j] == 0: # Ignore Suitability[i][j] = 0 elif Veg_arr[i][j] == 25: # Deciduous woodland Suitability[i][j] = 60 elif Veg_arr[i][j] == 50: # Coniferous woodland Suitability[i][j] = 55 elif Veg_arr[i][j] == 75: # Agriculture including pasture Suitability[i][j] = 98 elif Veg_arr[i][j] == 100: # Semi-natural grassland Suitability[i][j] = 90 elif Veg_arr[i][j] == 125: # Bog and swamp Suitability[i][j] = 50 elif Veg_arr[i][j] == 150: # Heath Suitability[i][j] = 75 elif Veg_arr[i][j] == 175: # Montane habitat Suitability[i][j] = 20 elif Veg_arr[i][j] == 200: # Rock and quarry Suitability[i][j] = 30 elif Veg_arr[i][j] == 225: # Urban Suitability[i][j] = 80 Display_fields = Geometry.GeometricFeature(Suitability, min_area,max_area ,aspect_ratio ,agri_area_limit) f = open('fields_arr', 'w') numpy.save(f,Display_fields) f.close() pylab.imsave(output_dir+"/fields.png",Display_fields) time2 = time.time() print "time taken", time2-time1 shutil.rmtree(fullpath+'/ascii_files')
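# A possibly tidier form of the landcover-to-suitability mapping used in main():
# the same scores kept in one dictionary and applied with a vectorised lookup.
# This is a sketch only; it assumes Veg_arr holds the same class codes tested in
# the elif chain (0, 25, ..., 225) and that numpy is available as in main().
import numpy

SUITABILITY_BY_LANDCOVER = {
    0: 0,      # ignore
    25: 60,    # deciduous woodland
    50: 55,    # coniferous woodland
    75: 98,    # agriculture including pasture
    100: 90,   # semi-natural grassland
    125: 50,   # bog and swamp
    150: 75,   # heath
    175: 20,   # montane habitat
    200: 30,   # rock and quarry
    225: 80,   # urban
}

def suitability_from_landcover(veg_arr):
    """Return a suitability array for a landcover class array (sketch)."""
    suitability = numpy.zeros(veg_arr.shape, dtype="uint8")
    for code, score in SUITABILITY_BY_LANDCOVER.items():
        suitability[veg_arr == code] = score
    return suitability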
def DecisionTree(no_of_veg_class, elev_filename, landcover_filename, river_filename,decision_tree): """ Generates a decision tree given the training data Input: no_of_veg_class: No of landcover class in training data elev_filename : Name of training file having elevation values landcover_filename: Name of training file having landcover values river_filename: Name of training file having river presence absence info """ rpy.r.library("rpart") g.use_temp_region() #TODO generalize no of rows and columns for training data rows = 2001 cols = 1201 resolution = 50 n = 4928050 #some arbitrary value s = n - resolution*rows e = 609000 #some arbitrary value w = e - resolution*cols g.run_command('g.region', flags = 'ap', n = n ,s = s, e = e, w = w,res = 50, rows = 2001 ,cols = 1201) pathname = os.path.dirname(sys.argv[0]) fullpath = os.path.abspath(pathname) if decision_tree: # Convert ascii DEM into grass raster map that will help in getting slope and aspect file_name = "/Training/%s" % elev_filename g.run_command('r.in.ascii', overwrite = True, flags='i', input = fullpath + file_name, output='training_DEM') # TODO read training DEM into array without writing another file g.run_command('r.out.ascii',flags='h',input='training_DEM@user1',output=fullpath + '/ascii_files'+'/training_DEM',null='0') f = open('ascii_files/training_DEM', 'r') Elev_arr = numpy.loadtxt(f) f.close() file_name = "Training/%s" % (landcover_filename) Landcover = numpy.loadtxt(file_name) # Read Landcover Data from ascii file file_name = "Training/%s" % (river_filename) River = numpy.loadtxt(file_name) # Read River Data from ascii file River_dist_arr = dist.CityBlock(River,flag = 1) #Compute distance from River data g.run_command('r.slope.aspect',overwrite=True,elevation='training_DEM@user1',slope='Slope',aspect='Aspect') g.run_command('r.out.ascii',flags='h',input='Slope@user1',output=fullpath + '/ascii_files'+'/Slope',null='0') f = open('ascii_files/Slope', 'r') Slope_arr = numpy.loadtxt(f) #Get Slope into an array f.close() g.run_command('r.out.ascii',flags='h',input='Aspect@user1',output=fullpath +'/ascii_files'+ '/Aspect',null='0') f = open('ascii_files/Aspect', 'r') Aspect_arr = numpy.loadtxt(f) #Get Aspect into an array f.close() (x_len,y_len) = Elev_arr.shape L = [ [] for i in range(0,no_of_veg_class)] for i in range(1,x_len-1): # Ignoring boundary cells for j in range(1,y_len-1): # Append the pixel co-ordinates into respective list of lists # nodata values already gets handled since we are ignoring it for k in range(0, no_of_veg_class): if Landcover[i][j] == k: L[k].append( (i,j) ) break minimum_elev = numpy.min(Elev_arr) factor = numpy.max(Elev_arr) - minimum_elev # normalize elevation data Elev_arr = (Elev_arr[:,:]-minimum_elev)*100/factor # Sample training Data for decision tree, we can't take entire data as it take longer processing time # various lists to hold sample training data Elevation = [] Slope = [] RiverDistance = [] Aspect = [] Class = [] # Sample the data for i in range(0,no_of_veg_class): if len(L[i]) < 1000: limit = len(L[i]) else: limit = 1000 for j in range(0,limit): Elevation.append( int(Elev_arr[ L[i][j][0] ][ L[i][j][1] ])) Slope.append(int(Slope_arr[ L[i][j][0] ][ L[i][j][1] ])) RiverDistance.append(int(River_dist_arr[ L[i][j][0] ][ L[i][j][1] ])) Aspect.append(int(Aspect_arr[ L[i][j][0] ][ L[i][j][1] ])) Class.append(i) #free space Elev_arr = 0 Slope_arr = 0 River_dist_arr = 0 Aspect_arr = 0 # create dictionary of sample data which will be needed to generate decision tree training_data = 
{'Elevation':Elevation,'Slope':Slope,'RiverDistance':RiverDistance,'Aspect':Aspect,'Class':Class} #free space Elevation = [] Slope = [] RiverDistance = [] Aspect = [] Class = [] f = open( 'save.p', 'w' ) pickle.dump(training_data, f ) f.close() else: f = open( 'save.p', 'r' ) training_data = pickle.load( f ) f.close() rpy.set_default_mode(rpy.NO_CONVERSION) #Using rpart create the decision tree fit = rpy.r.rpart(formula='Class ~ Elevation + RiverDistance + Slope + Aspect',data=training_data,method="class") training_data = 0 #rpy.r.png("DecisionTree.png") # Output a png image of the decision tree #rpy.r.plot(fit) #rpy.r.text(fit) #rpy.r.dev_off() return fit
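# Minimal round-trip sketch of the rpy/rpart idiom used by DecisionTree() and main():
# fit the tree while rpy is in NO_CONVERSION mode (so the model stays an R object),
# then switch to BASIC_CONVERSION before predict() so the class probabilities come
# back as plain Python values. The toy training/test dictionaries are made up; only
# the column names match the formula used above.
import rpy
from rpy import r

r.library("rpart")
toy_training = {'Elevation': [10, 20, 80, 90, 15, 85],
                'Slope': [2, 3, 25, 30, 1, 28],
                'RiverDistance': [5, 8, 50, 60, 4, 55],
                'Aspect': [90, 180, 270, 45, 135, 315],
                'Class': [0, 0, 1, 1, 0, 1]}
rpy.set_default_mode(rpy.NO_CONVERSION)
toy_fit = r.rpart(formula='Class ~ Elevation + RiverDistance + Slope + Aspect',
                  data=toy_training, method="class")
rpy.set_default_mode(rpy.BASIC_CONVERSION)
toy_values = r.predict(toy_fit,
                       newdata={'Elevation': [12, 88], 'Slope': [2, 27],
                                'RiverDistance': [6, 52], 'Aspect': [100, 300]},
                       method="class")
print toy_values  # one row of class probabilities per test point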
def __init__(self, y, design, model_type=r.lm, **kwds): ''' Set up and estimate R model with data and design ''' r.library('MASS') # still needs to be in test, but also here for # logical tests at the end not to show an error self.y = np.array(y) self.design = np.array(design) self.model_type = model_type self._design_cols = [ 'x.%d' % (i + 1) for i in range(self.design.shape[1]) ] # Note the '-1' for no intercept - this is included in the design self.formula = r('y ~ %s-1' % '+'.join(self._design_cols)) self.frame = r.data_frame(y=y, x=self.design) rpy.set_default_mode(rpy.NO_CONVERSION) results = self.model_type(self.formula, data=self.frame, **kwds) self.robj = results # keep the Robj model so it can be # used in the tests rpy.set_default_mode(rpy.BASIC_CONVERSION) rsum = r.summary(results) self.rsum = rsum # Provide compatible interface with scipy models self.results = results.as_py() # coeffs = self.results['coefficients'] # self.beta0 = np.array([coeffs[c] for c in self._design_cols]) self.nobs = len(self.results['residuals']) if isinstance(self.results['residuals'], dict): self.resid = np.zeros((len(self.results['residuals'].keys()))) for i in self.results['residuals'].keys(): self.resid[int(i) - 1] = self.results['residuals'][i] else: self.resid = self.results['residuals'] self.fittedvalues = self.results['fitted.values'] self.df_resid = self.results['df.residual'] self.params = rsum['coefficients'][:, 0] self.bse = rsum['coefficients'][:, 1] self.bt = rsum['coefficients'][:, 2] try: self.pvalues = rsum['coefficients'][:, 3] except: pass self.rsquared = rsum.setdefault('r.squared', None) self.rsquared_adj = rsum.setdefault('adj.r.squared', None) self.aic_R = rsum.setdefault('aic', None) self.fvalue = rsum.setdefault('fstatistic', None) if self.fvalue and isinstance(self.fvalue, dict): self.fvalue = self.fvalue.setdefault('value', None) # for wls df = rsum.setdefault('df', None) if df: # for RLM, works for other models? self.df_model = df[0] - 1 # R counts intercept self.df_resid = df[1] self.bcov_unscaled = rsum.setdefault('cov.unscaled', None) self.bcov = rsum.setdefault('cov.scaled', None) if 'sigma' in rsum: self.scale = rsum['sigma'] elif 'dispersion' in rsum: self.scale = rsum['dispersion'] else: self.scale = None self.llf = r.logLik(results) if model_type == r.glm: self.getglm() if model_type == r.rlm: self.getrlm()
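# Usage sketch for the wrapper whose __init__ is shown above. The class name is not
# visible in this excerpt, so "RModel" below is an assumed name; the attributes
# exercised (params, bse, rsquared) are the ones set in __init__.
import numpy as np
from rpy import r

toy_y = np.random.randn(20)
toy_design = np.column_stack((np.ones(20), np.random.randn(20), np.random.randn(20)))
toy_model = RModel(toy_y, toy_design, model_type=r.lm)  # assumed class name
print toy_model.params     # coefficients, one per design column
print toy_model.bse        # standard errors from r.summary()
print toy_model.rsquared   # may be None for models without r.squared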
if __name__ == '__main__': modules = os.listdir('.') if '--random' in sys.argv: shuffle=True sys.argv.remove('--random') else: shuffle=False if '--loop' in sys.argv: niter = 1000 sys.argv.remove('--loop') else: niter = 1 modules = filter( lambda x: not x.endswith('.pyc'), modules) modules = filter( lambda x: x.startswith('test_'), modules) modules = filter( lambda x: x.endswith('.py'), modules) print "Modules to be tested:", modules for iter in range(niter): if shuffle: random.shuffle(modules) for module in modules: name = module[:-3] print 'Testing:', name rpy.set_default_mode(rpy.NO_DEFAULT) # reset to base case run(name)
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-c", "--columns", dest="columns", type="string", help="columns to take for calculating histograms.") parser.add_option("-t", "--tree-nh-file", dest="filename_tree", type="string", help="filename with tree(s).") parser.add_option("--skip-header", dest="add_header", action="store_false", help="do not add header to flat format.") parser.add_option("--output-with-header", dest="write_header", action="store_true", help="write header and exit.") parser.add_option("--debug", dest="debug", action="store_true", help="debug mode") parser.add_option("--display-tree", dest="display_tree", action="store_true", help="display the tree") parser.add_option("-m", "--method", dest="methods", type="choice", action="append", choices=("contrasts", "spearman", "pearson", "compute"), help="methods to perform on contrasts.") parser.set_defaults( columns="all", filename_tree=None, add_header=True, write_header=False, debug=False, methods=[], value_format="%6.4f", pvalue_format="%e", display_tree=False, ) (options, args) = E.Start(parser, quiet=True) if options.columns not in ("all", "all-but-first"): options.columns = map(lambda x: int(x) - 1, options.columns.split(",")) phylip = WrapperPhylip.Phylip() if options.debug: phylip.setLogLevel(options.loglevel) phylip.setProgram("contrast") ########################################################## ########################################################## ########################################################## # retrieve data and give to phylip data = [] headers = [] first = True for line in sys.stdin: if line[0] == "#": continue d = line[:-1].strip().split("\t") if first: first = False headers = d[1:] continue data.append(d) phylip.setData(data) ncolumns = len(headers) nrows = len(data) ########################################################## ########################################################## ########################################################## # read trees nexus = None if options.filename_tree: nexus = TreeTools.Newick2Nexus(open(options.filename_tree, "r")) if not nexus: raise ValueError("please provide trees with branchlenghts") ########################################################## ########################################################## ########################################################## # set up phylip phylip_options = [] # print out contrasts phylip_options.append("C") phylip_options.append("Y") phylip.setOptions(phylip_options) ########################################################## ########################################################## ########################################################## # main loop ########################################################## for tree in nexus.trees: if options.display_tree: tree.display() # compute this before giving the tree to the phylip module, # as it remaps taxon names. 
map_node2data = {} for x in range(nrows): taxon = data[x][0] map_node2data[tree.search_taxon(taxon)] = x phylip.setTree(tree) result = phylip.run() for method in options.methods: if method in ("pearson", "spearman"): options.stdout.write("header1\theader2\tr\tp\tcode\n") # n = len(result.mContrasts) columns = [] for c in range(ncolumns): columns.append(map(lambda x: x[c], result.mContrasts)) for x in range(0, ncolumns - 1): for y in range(x + 1, ncolumns): # phylip value phy_r = result.mCorrelations[x][y] import rpy from rpy import r as R # Various ways to calculate r. It is not # possible to use cor.test or lsfit directly, # as you have to perform a regression through # the origin. # uncomment to check pearson r against # phylip's value r = # calculateCorrelationCoefficient(columns[x], # columns[y]) # for significance, use linear regression models in R rpy.set_default_mode(rpy.NO_CONVERSION) linear_model = R.lm( R("y ~ x - 1"), data=R.data_frame(x=columns[x], y=columns[y])) rpy.set_default_mode(rpy.BASIC_CONVERSION) ss = R.summary(linear_model) # extract the p-value p = ss['coefficients'][-1][-1] if p < 0.001: code = "***" elif p < 0.01: code = "**" elif p < 0.05: code = "*" else: code = "" options.stdout.write("\t".join( (headers[x], headers[y], options.value_format % phy_r, options.pvalue_format % p, code)) + "\n") elif method == "contrasts": options.stdout.write("\t".join(headers) + "\n") for d in result.mContrasts: options.stdout.write( "\t".join( map(lambda x: options.value_format % x, d)) + "\n") elif method == "compute": # make room for all internal nodes and one dummy node # for unrooted trees. max_index = TreeTools.GetMaxIndex(tree) + 2 variances = [None] * max_index values = [[None] * nrows for x in range(max_index)] contrasts = [] for x in range(max_index): contrasts.append([None] * ncolumns) branchlengths = [None] * max_index def update_data(node_id, bl, c1, c2, ): b1, b2 = branchlengths[c1], branchlengths[c2] rb1 = 1.0 / b1 rb2 = 1.0 / b2 # compute variance variance = math.sqrt(b1 + b2) # extend branch length of this node to create correct # variance for parent branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2) variances[node_id] = variance for c in range(ncolumns): v1, v2 = values[c1][c], values[c2][c] # save ancestral value as weighted mean values[node_id][c] = ( (rb1 * v1 + rb2 * v2)) / (rb1 + rb2) # compute normalized contrast contrasts[node_id][c] = (v1 - v2) / variance def update_contrasts(node_id): """update contrasts for a node.""" node = tree.node(node_id) if node.succ: if len(node.succ) == 2: c1, c2 = node.succ update_data( node_id, node.data.branchlength, c1, c2) else: assert(node_id == tree.root) assert(len(node.succ) == 3) update_data( node_id, node.data.branchlength, node.succ[0], node.succ[1]) update_data( max_index - 1, node.data.branchlength, node_id, node.succ[2]) else: for c in range(ncolumns): values[node_id][c] = float( data[map_node2data[node_id]][c + 1]) branchlengths[node_id] = node.data.branchlength tree.dfs(tree.root, post_function=update_contrasts) options.stdout.write( "node_id\tvariance\t%s\n" % "\t".join(headers)) for node_id in range(max_index): if variances[node_id] is None: continue options.stdout.write("%s\t%s\t%s\n" % ( node_id, options.value_format % variances[ node_id], "\t".join( map(lambda x: options.value_format % x, contrasts[node_id])), )) E.Stop()
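# The commented-out check above mentions recomputing the correlation coefficient
# directly; for a regression forced through the origin (as required for independent
# contrasts) the uncentred correlation is r = sum(xy) / sqrt(sum(x^2) * sum(y^2)).
# A small sketch of that check; the function name is illustrative.
import math

def correlation_through_origin(xs, ys):
    """Uncentred correlation coefficient, matching a fit through the origin."""
    sxy = sum(a * b for a, b in zip(xs, ys))
    sxx = sum(a * a for a in xs)
    syy = sum(b * b for b in ys)
    return sxy / math.sqrt(sxx * syy)

# e.g. correlation_through_origin(columns[x], columns[y]) should agree with the
# phy_r value reported by phylip for the same pair of contrast columns.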
def rpart_fit(self, known_data, parameter_list, bit_string="11111"): """ 11-09-05 1st use known_data to get the fit model 2nd use the fit model to do prediction on all_data, result is prob for each class 11-09-05 add rpart_cp 11-17-05 add loss_matrix, prior_prob return two pred 11-23-05 split fit and predict. rpart_fit_and_predict() is split into rpart_fit() and rpart_predict() 11-27-05 r cleanup 03-17-06 use parameter_list instead """ if self.debug: sys.stderr.write("Doing rpart_fit...\n") # 03-17-06 rpart_cp, loss_matrix, prior_prob = parameter_list # 11-27-05 r cleanup from rpy import r r.library("rpart") coeff_name_list = ["p_value", "recurrence", "connectivity", "cluster_size", "gradient"] formula_list = [] for i in range(len(bit_string)): if bit_string[i] == "1": formula_list.append(coeff_name_list[i]) # 11-17-05 transform into array known_data = array(known_data) set_default_mode(NO_CONVERSION) data_frame = r.as_data_frame( { "p_value": known_data[:, 0], "recurrence": known_data[:, 1], "connectivity": known_data[:, 2], "cluster_size": known_data[:, 3], "gradient": known_data[:, 4], "is_correct": known_data[:, -1], } ) if prior_prob: prior_prob = [prior_prob, 1 - prior_prob] # get the full list fit = r.rpart( r("is_correct~%s" % "+".join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp), parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix)), ) else: fit = r.rpart( r("is_correct~%s" % "+".join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp), parms=r.list(loss=r.matrix(loss_matrix)), ) del data_frame if self.debug: sys.stderr.write("Done rpart_fit.\n") return fit
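# Sketch of how rpart_fit() and rpart_predict() are typically wired together from
# inside this class. The numbers are placeholders; the loss-matrix layout and the
# cp value are assumptions, and known_data/unknown_data are expected to use the
# column order documented above (p_value, recurrence, connectivity, cluster_size,
# gradient, ..., is_correct last).
rpart_cp = 0.01                  # assumed complexity parameter for rpart.control
loss_matrix = [[0, 1], [1, 0]]   # assumed 2x2 loss layout passed on to r.matrix()
prior_prob = 0.7                 # prior for one class; rpart_fit() expands it to [p, 1-p]
parameter_list = [rpart_cp, loss_matrix, prior_prob]
fit = self.rpart_fit(known_data, parameter_list, bit_string="11111")
pred = self.rpart_predict(fit, unknown_data)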
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = [ "rFile=", "chr=", "delim=", "missingval=", "BoundaryStart=", "removeOutliers=", "addConstant=", "logTransform", "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "LRT", "minMAF=", "kinshipDatafile=", "phenotypeRanks", "onlyMissing", "onlyOriginal96", "onlyOriginal192", "onlyBelowLatidue=", "complement", "negate", "srInput=", "sr", "srOutput=", "srPar=", "srSkipFirstRun", "testRobustness", "permutationFilter=", "useLinearRegress", "regressionCofactors=", "FriLerAsCofactor", "FriColAsCofactor", "memReq=", "walltimeReq=", ] try: opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeRanks = False removeOutliers = None addConstant = -1 phenotypeFileType = 1 rFile = None delim = "," missingVal = "NA" help = 0 minMAF = 0.0 boundaries = [-1, -1] chr = None parallel = None logTransform = False negate = False parallelAll = False lrt = False kinshipDatafile = None onlyMissing = False onlyOriginal96 = False onlyOriginal192 = False onlyBelowLatidue = None complement = False sr = False srOutput = False srInput = False srSkipFirstRun = False srTopQuantile = 0.95 srWindowSize = 30000 testRobustness = False permutationFilter = 0.002 useLinearRegress = False regressionCofactors = None FriLerAsCofactor = False FriColAsCofactor = False memReq = "5g" walltimeReq = "150:00:00" for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-o", "--rFile"): rFile = arg elif opt in ("--phenotypeFileType"): phenotypeFileType = int(arg) elif opt in ("--BoundaryStart"): boundaries[0] = int(arg) elif opt in ("--BoundaryEnd"): boundaries[1] = int(arg) elif opt in ("--addConstant"): addConstant = float(arg) elif opt in ("--parallel"): parallel = arg elif opt in ("--minMAF"): minMAF = float(arg) elif opt in ("--parallelAll"): parallelAll = True elif opt in ("--onlyMissing"): onlyMissing = True elif opt in ("--onlyOriginal96"): onlyOriginal96 = True elif opt in ("--onlyOriginal192"): onlyOriginal192 = True elif opt in ("--onlyBelowLatidue"): onlyBelowLatidue = float(arg) elif opt in ("--complement"): complement = True elif opt in ("--logTransform"): logTransform = True elif opt in ("--negate"): negate = True elif opt in ("--removeOutliers"): removeOutliers = float(arg) elif opt in ("--LRT"): lrt = True elif opt in ("-c", "--chr"): chr = int(arg) elif opt in ("-d", "--delim"): delim = arg elif opt in ("-m", "--missingval"): missingVal = arg elif opt in ("--kinshipDatafile"): kinshipDatafile = arg elif opt in ("--phenotypeRanks"): phenotypeRanks = True elif opt in ("--sr"): sr = True elif opt in ("--srSkipFirstRun"): srSkipFirstRun = True elif opt in ("--srInput"): srInput = arg elif opt in ("--srOutput"): srOutput = arg elif opt in ("--srPar"): vals = arg.split(",") srTopQuantile = float(vals[0]) srWindowSize = int(vals[1]) elif opt in ("--testRobustness"): testRobustness = True elif opt in ("--permutationFilter"): permutationFilter = float(arg) elif opt in ("--FriLerAsCofactor"): FriLerAsCofactor = True elif opt in ("--FriColAsCofactor"): FriColAsCofactor = True elif opt in ("--useLinearRegress"): useLinearRegress = True elif opt in ("--regressionCofactors"): regressionCofactors = arg elif opt in ("--memReq"): memReq = arg elif opt in ("--walltimeReq"): walltimeReq = arg else: if help == 0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args) < 3 and not parallel: if help == 0: 
print "Arguments are missing!!\n" print __doc__ sys.exit(2) print "Emma is being set up with the following parameters:" print "output:", rFile print "phenotypeRanks:", phenotypeRanks print "phenotypeFileType:", phenotypeFileType print "parallel:", parallel print "parallelAll:", parallelAll print "minMAF:", minMAF print "LRT:", lrt print "delim:", delim print "missingval:", missingVal print "kinshipDatafile:", kinshipDatafile print "chr:", chr print "boundaries:", boundaries print "onlyMissing:", onlyMissing print "onlyOriginal96:", onlyOriginal96 print "onlyOriginal192:", onlyOriginal192 print "onlyBelowLatidue:", onlyBelowLatidue print "complement:", complement print "negate:", negate print "logTransform:", logTransform print "addConstant:", addConstant print "removeOutliers:", removeOutliers print "sr:", sr print "srSkipFirstRun:", srSkipFirstRun print "srInput:", srInput print "srOutput:", srOutput print "srTopQuantile:", srTopQuantile print "srWindowSize:", srWindowSize print "testRobustness:", testRobustness print "permutationFilter:", permutationFilter print "useLinearRegress:", useLinearRegress print "regressionCofactors:", regressionCofactors print "FriLerAsCofactor:", FriLerAsCofactor print "FriColAsCofactor:", FriColAsCofactor print "walltimeReq:", walltimeReq print "memReq:", memReq def runParallel(phenotypeIndex, phed): #Cluster specific parameters print phenotypeIndex phenName = phed.getPhenotypeName(phenotypeIndex) outFileName = resultDir + "Emma_" + parallel + "_" + phenName shstr = "#!/bin/csh\n" shstr += "#PBS -l walltime=" + walltimeReq + "\n" shstr += "#PBS -l mem=" + memReq + "\n" shstr += "#PBS -q cmb\n" shstr += "#PBS -N E" + phenName + "_" + parallel + "\n" shstr += "set phenotypeName=" + parallel + "\n" shstr += "set phenotype=" + str(phenotypeIndex) + "\n" if useLinearRegress: outFileName = resultDir + "LR_" + parallel + "_" + phenName shstr += "(python " + emmadir + "Emma.py -o " + outFileName + " " if useLinearRegress: shstr += " --useLinearRegress " if regressionCofactors: shstr += " --regressionCofactors=" + str(regressionCofactors) + " " if FriLerAsCofactor: shstr += " --FriLerAsCofactor " if FriColAsCofactor: shstr += " --FriColAsCofactor " if onlyOriginal96: shstr += " --onlyOriginal96 " elif onlyOriginal192: shstr += " --onlyOriginal192 " if onlyBelowLatidue: shstr += " --onlyBelowLatidue=" + str(onlyBelowLatidue) + " " if logTransform: shstr += " --logTransform " if negate: shstr += " --negate " if removeOutliers: shstr += " --removeOutliers=" + str(removeOutliers) + " " if phenotypeRanks: shstr += " --phenotypeRanks " if testRobustness: shstr += " --testRobustness " shstr += " --permutationFilter=" + str(permutationFilter) + " " if sr: shstr += " --sr " if not srOutput: output = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals" shstr += " --srOutput=" + str(output) + " " if srSkipFirstRun: if not srInput: output = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals" shstr += " --srInput=" + str(output) + " " shstr += " --srSkipFirstRun " shstr += " --srPar=" + str(srTopQuantile) + "," + str( srWindowSize) + " " if kinshipDatafile: shstr += " --kinshipDatafile=" + str(kinshipDatafile) + " " shstr += " --addConstant=" + str(addConstant) + " " shstr += snpsDataFile + " " + phenotypeDataFile + " " + str( phenotypeIndex) + " " shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n" f = open(parallel + ".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub " + parallel + ".sh ") snpsDataFile = 
args[0] phenotypeDataFile = args[1] if parallel: #Running on the cluster.. phed = phenotypeData.readPhenotypeFile( phenotypeDataFile, delimiter='\t') #Get Phenotype data if parallelAll: for phenotypeIndex in phed.phenIds: if onlyMissing: phenName = phed.getPhenotypeName(phenotypeIndex) pvalFile = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals" res = None try: res = os.stat(pvalFile) except Exception: print "File", pvalFile, "does not exist." if res and res.st_size > 0: print "File", pvalFile, "already exists, and is non-empty." if sr: srInput = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals" srRes = None try: srRes = os.stat(srInput) except Exception: print "File", srInput, "does not exist." if srRes and srRes.st_size > 0: print "File", srInput, "already exists, and is non-empty." else: runParallel(phenotypeIndex, phed) else: print "Setting up the run." runParallel(phenotypeIndex, phed) else: runParallel(phenotypeIndex, phed) else: phenotypeIndex = int(args[2]) runParallel(phenotypeIndex, phed) return else: phenotypeIndex = int(args[2]) print "phenotypeIndex:", phenotypeIndex print "\nStarting program now!\n" snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal) #Load phenotype file phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t') #Get Phenotype data numAcc = len(snpsds[0].accessions) #Removing outliers if removeOutliers: print "Remoing outliers" phed.naOutliers(phenotypeIndex, removeOutliers) #If onlyOriginal96, then remove all other phenotypes.. if onlyOriginal96: print "Filtering for the first 96 accessions" original_96_ecotypes = phenotypeData._getFirst96Ecotypes_() original_96_ecotypes = map(str, original_96_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_96_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_96_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyOriginal192: print "Filtering for the first 192 accessions" original_192_ecotypes = phenotypeData._getFirst192Ecotypes_() original_192_ecotypes = map(str, original_192_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_192_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_192_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyBelowLatidue: print "Filtering for the accessions which orginate below latitude", onlyBelowLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][ 2] and eiDict[acc][2] < onlyBelowLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and eiDict[acc][2] == None: keepEcotypes.append(str(acc)) phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) sys.stdout.write("Finished prefiltering phenotype accessions.\n") sys.stdout.flush() phenotype = phed.getPhenIndex(phenotypeIndex) accIndicesToKeep = [] phenAccIndicesToKeep = [] #Checking which accessions to keep and which to remove . 
for i in range(0, len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2 = phed.accessions[j] if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break print "\nFiltering accessions in genotype data:" #Filter accessions which do not have the phenotype value (from the genotype data). for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc - len( accIndicesToKeep ), "accessions removed from genotype data, leaving", len( accIndicesToKeep), "accessions in all." print "\nNow filtering accessions in phenotype data:" phed.removeAccessions( phenAccIndicesToKeep ) #Removing accessions that don't have genotypes or phenotype values print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is", len( phed.accessions) == len(snpsds[0].accessions) if len(phed.accessions) != len(snpsds[0].accessions): raise Exception #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps" #Remove minor allele frequencies if minMAF != 0: sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.filterMinMAF(minMAF) #Removing SNPs which are outside of boundaries. if chr: print "\nRemoving SNPs which are outside of boundaries." snpsds[chr - 1].filterRegion(boundaries[0], boundaries[1]) snpsds = [snpsds[chr - 1]] #Ordering accessions in genotype data to fit phenotype data. print "Ordering genotype data accessions." accessionMapping = [] i = 0 for acc in phed.accessions: if acc in snpsds[0].accessions: accessionMapping.append((snpsds[0].accessions.index(acc), i)) i += 1 #print zip(accessionMapping,snpsds[0].accessions) print "len(snpsds[0].snps)", len(snpsds[0].snps) for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.orderAccessions(accessionMapping) print "\nGenotype data has been ordered." #Converting format to 01 newSnpsds = [] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal)) print "" print "Checking kinshipfile:", kinshipDatafile if kinshipDatafile: #Is there a special kinship file? kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal) accIndicesToKeep = [] #Checking which accessions to keep and which to remove (genotype data). sys.stdout.write( "Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".") sys.stdout.flush() for i in range(0, len(kinshipSnpsds[0].accessions)): acc1 = kinshipSnpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2 = phed.accessions[j] if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA': accIndicesToKeep.append(i) break print accIndicesToKeep for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc - len( accIndicesToKeep ), "accessions removed from kinship genotype data, leaving", len( accIndicesToKeep), "accessions in all." print "Ordering kinship data accessions." 
accessionMapping = [] i = 0 for acc in snpsds[0].accessions: if acc in kinshipSnpsds[0].accessions: accessionMapping.append( (kinshipSnpsds[0].accessions.index(acc), i)) i += 1 print zip(accessionMapping, snpsds[0].accessions) print "len(snpsds[0].snps)", len(snpsds[0].snps) for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.orderAccessions(accessionMapping) print "Kinship genotype data has been ordered." newKinshipSnpsds = [] sys.stdout.write("Converting data format") for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() newKinshipSnpsds.append(snpsd.getSnpsData( missingVal=missingVal)) #This data might have NAs print "" kinshipSnpsds = newKinshipSnpsds else: kinshipSnpsds = newSnpsds print "Found kinship data." #Ordering accessions according to the order of accessions in the genotype file # accessionMapping = [] # i = 0 # for acc in snpsds[0].accessions: # if acc in phed.accessions: # accessionMapping.append((phed.accessions.index(acc),i)) # i += 1 # phed.orderAccessions(accessionMapping) #Negating phenotypic values if negate: phed.negateValues(phenotypeIndex) if logTransform and not phed.isBinary( phenotypeIndex) and phed.getMinValue(phenotypeIndex) <= 0: addConstant = 0 #Adding a constant. if addConstant != -1: if addConstant == 0: addConstant = math.sqrt(phed.getVariance(phenotypeIndex)) / 10 addConstant = addConstant - phed.getMinValue(phenotypeIndex) print "Adding a constant to phenotype:", addConstant phed.addConstant(phenotypeIndex, addConstant) #Log-transforming if logTransform: print "Log transforming phenotype" phed.logTransform(phenotypeIndex) #Converting phenotypes to Ranks elif phenotypeRanks: phed.transformToRanks(phenotypeIndex) if not chr: snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5]) kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [1, 2, 3, 4, 5]) else: snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [chr]) kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [chr]) phenotypeName = phed.getPhenotypeName(phenotypeIndex) sys.stdout.flush() if testRobustness: print "Starting a robustness test" allSNPs = [] for snpsd in snpsDataset.snpsDataList: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) _robustness_test_(allSNPs, phenVals, rFile, filter=permutationFilter) sys.exit(0) if useLinearRegress: phenVals = phed.getPhenVals(phenotypeIndex) d0 = {} d0["phen"] = phenVals dh = {} dh["phen"] = phenVals import rpy, gc if regressionCofactors: #Adds ler and col as cofactors import pickle f = open(regressionCofactors, "r") co_factors = pickle.load(f) f.close() #inserting co factors into model for factor in co_factors: d[factor] = co_factors[factor] import analyzeHaplotype as ah (ler_factor, col_factor) = ah.getLerAndColAccessions(newSnpsds, True) if FriColAsCofactor: d0["col"] = col_factor dh["col"] = col_factor if FriLerAsCofactor: d0["ler"] = ler_factor dh["ler"] = ler_factor chr_pos_pvals = [] stats = [] sys.stdout.write("Applying the linear model") sys.stdout.flush() for i in range(0, len(newSnpsds)): #[3]:# snpsd = newSnpsds[i] sys.stdout.write("|") sys.stdout.flush() gc.collect( ) #Calling garbage collector, in an attempt to clean up memory.. 
for j in range(0, len(snpsd.snps)): if j % 5000 == 0: sys.stdout.write(".") sys.stdout.flush() #if snpsd.positions[j]>1700000: # break snp = snpsd.snps[j] d0["snp"] = snp try: rpy.set_default_mode(rpy.NO_CONVERSION) aov0 = rpy.r.aov(r("phen ~ ."), data=d0) aovh = rpy.r.aov(r("phen ~ ."), data=dh) rpy.set_default_mode(rpy.BASIC_CONVERSION) s0 = rpy.r.summary(aov0) sh = rpy.r.summary(aovh) #print s0,sh rss_0 = s0['Sum Sq'][-1] if type(sh['Sum Sq']) != float: rss_h = sh['Sum Sq'][-1] else: rss_h = sh['Sum Sq'] f = (rss_h - rss_0) / (rss_0 / (len(phenVals) - len(d0) + 1)) pval = rpy.r.pf(f, 1, len(phenVals), lower_tail=False) except Exception, err_str: print "Calculating p-value failed" #,err_str pval = 1.0 #print "dh:",dh #print "d0:",d0 #print "rss_h,rss_0:",rss_h,rss_0 #print "f,p:",f,pval chr_pos_pvals.append([i + 1, snpsd.positions[j], pval]) mafc = min(snp.count(snp[0]), len(snp) - snp.count(snp[0])) maf = mafc / float(len(snp)) stats.append([maf, mafc]) sys.stdout.write("\n") #Write out to a result file sys.stdout.write("Writing results to file\n") sys.stdout.flush() pvalFile = rFile + ".pvals" f = open(pvalFile, "w") f.write("Chromosome,position,p-value,marf,maf\n") for i in range(0, len(chr_pos_pvals)): chr_pos_pval = chr_pos_pvals[i] stat = stats[i] f.write( str(chr_pos_pval[0]) + "," + str(chr_pos_pval[1]) + "," + str(chr_pos_pval[2]) + "," + str(stat[0]) + "," + str(stat[1]) + "\n") f.close() #Plot results print "Generating a GW plot." phenotypeName = phed.getPhenotypeName(phenotypeIndex) res = gwaResults.Result(pvalFile, name="LM_" + phenotypeName, phenotypeID=phenotypeIndex) res.negLogTransform() pngFile = pvalFile + ".png" plotResults.plotResult(res, pngFile=pngFile, percentile=90, type="pvals", ylab="$-$log$_{10}(p)$", plotBonferroni=True, usePylab=False)
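# The per-SNP test in the loop above compares two nested aov fits; factored out, the
# statistic is an F test for adding the SNP term. A sketch that mirrors the same
# calculation (same degrees of freedom as used in the loop); the function name is
# illustrative.
import rpy

def snp_f_pvalue(rss_reduced, rss_full, n_obs, n_terms_full):
    """F test for adding the SNP: rss_reduced comes from the model without the
    SNP (dh), rss_full from the model with it (d0)."""
    f = (rss_reduced - rss_full) / (rss_full / (n_obs - n_terms_full + 1))
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    return rpy.r.pf(f, 1, n_obs, lower_tail=False)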
def calc_directly_std_rates(summset, popset, stdpopset=None, conflev=0.95, basepop=100000, timeinterval='years', ci_method='dobson', popset_popcol='_freq_', stdpopset_popcol='_stdpop_', axis=0, debug=False): """ Calculate Directly Standardised Population Rates summset is a summary dataset of counts of events for the population-of-interest being compared to the standard population. popset is the stratified population counts for the population-of-interest stdpopset is the stratified population counts for the standard population """ from rpy import r, get_default_mode, set_default_mode, BASIC_CONVERSION alpha = get_alpha(conflev) if ci_method not in ('dobson', 'ff'): raise Error('Only Dobson et al. (dobson) and Fay-Feuer (ff) methods ' 'for confidence intervals currently implemented') if not popset.has_column(popset_popcol): raise Error('Denominator population dataset %r does not have a ' '%r column' % (popset.label or popset.name, popset_popcol)) if stdpopset is not None and not stdpopset.has_column(stdpopset_popcol): raise Error('Standard population dataset %r does not have a ' '%r column' % (stdpopset.label or stdpopset.name, stdpopset_popcol)) st = time.time() r_mode = get_default_mode() try: set_default_mode(BASIC_CONVERSION) # We turn the summset into an Ncondcols-dimensional matrix summtab = CrossTab.from_summset(summset) if stdpopset is not None: # Then attempt to do the same to the stdpop data, summing any # axes not required and replicate any missing until we have an # array the same shape as the summtab array. stdtab = CrossTab.from_summset(stdpopset, shaped_like=summtab) stdtab.collapse_axes_not_in(summtab) stdtab.replicate_axes(summtab) stdpop = stdtab[stdpopset_popcol].data.astype(Numeric.Float64) # The population dataset must have at least as many dimensions as # summary dataset. Any additional axes are eliminated by summing. # any missing axes are created by replication. poptab = CrossTab.from_summset(popset, shaped_like=summtab) poptab.collapse_axes_not_in(summtab) poptab.replicate_axes(summtab) popfreq = poptab[popset_popcol].data.astype(Numeric.Float64) # Manufacture a CrossTab for the result, with one less axis (the first) result = summtab.empty_copy() del result.axes[axis] if stdpopset is not None: sum_stdpop = sumaxis(stdpop) stdwgts = stdpop / sum_stdpop stdpop_sq = stdpop**2 sum_stdpop_sq = sum_stdpop**2 ffwi = stdwgts / popfreq ffwm = MA.maximum(MA.ravel(ffwi)) basepop = float(basepop) for table, name, n_add, l_add in just_freq_tables(summtab): # avoid integer overflows... summfreq = table.data.astype(Numeric.Float64) strata_rate = summfreq / popfreq result.add_table('summfreq' + n_add, data=sumaxis(summfreq, axis), label='Total events' + l_add) result.add_table('popfreq' + n_add, data=sumaxis(popfreq, axis), label='Total person-' + timeinterval + ' at risk' + l_add) if stdpopset is not None: std_strata_summfreq = summfreq * Numeric.where( MA.getmask(stdwgts), 0., 1.) 
wgtrate = strata_rate * stdwgts result.add_table('std_strata_summfreq' + n_add, data=sumaxis(std_strata_summfreq, axis), label="Total events in standard strata" + l_add) # Crude rate cr = sumaxis(summfreq, axis) / sumaxis(popfreq, axis) * basepop result.add_table('cr' + n_add, data=cr, label='Crude Rate per ' + '%d' % basepop + ' person-' + timeinterval + l_add) if alpha is not None: # CIs for crude rate count = sumaxis(summfreq, axis) count_shape = count.shape count_flat = MA.ravel(count) totpop = sumaxis(popfreq, axis) assert totpop.shape == count.shape totpop_flat = MA.ravel(totpop) cr_ll = Numeric.empty(len(count_flat), typecode=Numeric.Float64) cr_ul = Numeric.empty(len(count_flat), typecode=Numeric.Float64) cr_ll_mask = Numeric.zeros(len(count_flat), typecode=Numeric.Int8) cr_ul_mask = Numeric.zeros(len(count_flat), typecode=Numeric.Int8) for i, v in enumerate(count_flat): try: if v == 0: cr_ll[i] = 0.0 else: cr_ll[i] = ( (r.qchisq(alpha / 2., df=2.0 * v) / 2.0) / totpop_flat[i]) * basepop cr_ul[i] = ( (r.qchisq(1. - alpha / 2., df=2.0 * (v + 1)) / 2.0) / totpop_flat[i]) * basepop except: cr_ll[i] = 0.0 cr_ul[i] = 0.0 cr_ll_mask[i] = 1 cr_ul_mask[i] = 1 cr_ll = MA.array(cr_ll, mask=cr_ll_mask, typecode=MA.Float64) cr_ul = MA.array(cr_ul, mask=cr_ul_mask, typecode=MA.Float64) cr_ll.shape = count_shape cr_ul.shape = count_shape cr_base = 'Crude rate %d%%' % (100.0 * conflev) result.add_table('cr_ll' + n_add, data=cr_ll, label=cr_base + ' lower confidence limit ' + l_add) result.add_table('cr_ul' + n_add, data=cr_ul, label=cr_base + ' upper confidence limit ' + l_add) if stdpopset is not None: # Directly Standardised Rate dsr = sumaxis(wgtrate, axis) result.add_table('dsr' + n_add, data=dsr * basepop, label='Directly Standardised Rate per ' + '%d' % basepop + ' person-' + timeinterval + l_add) # Confidence Intervals if alpha is None or name != '_freq_': # Can only calculate confidence intervals on freq cols continue if ci_method == 'dobson': # Dobson et al method # see: Dobson A, Kuulasmaa K, Eberle E, Schere J. Confidence intervals for weighted sums # of Poisson parameters, Statistics in Medicine, Vol. 10, 1991, pp. 457-62. # se_wgtrate = summfreq*((stdwgts/(popfreq/basepop))**2) se_wgtrate = summfreq * ((stdwgts / (popfreq))**2) stderr = stdpop_sq * strata_rate * (1.0 - strata_rate) se_rate = sumaxis(se_wgtrate, axis) sumsei = sumaxis(stderr, axis) total_freq = sumaxis(std_strata_summfreq, axis) # get shape of total_freq total_freq_shape = total_freq.shape total_freq_flat = MA.ravel(total_freq) # flat arrays to hold results and associated masks l_lam = Numeric.empty(len(total_freq_flat), typecode=Numeric.Float64) u_lam = Numeric.empty(len(total_freq_flat), typecode=Numeric.Float64) l_lam_mask = Numeric.zeros(len(total_freq_flat), typecode=Numeric.Int8) u_lam_mask = Numeric.zeros(len(total_freq_flat), typecode=Numeric.Int8) conflev_l = (1 - conflev) / 2.0 conflev_u = (1 + conflev) / 2.0 for i, v in enumerate(total_freq_flat): try: if v == 0.: u_lam[i] = -math.log(1 - conflev) l_lam[i] = 0.0 else: l_lam[i] = r.qgamma(conflev_l, v, scale=1.) u_lam[i] = r.qgamma(conflev_u, v + 1., scale=1.) 
except: l_lam[i] = 0.0 u_lam[i] = 0.0 l_lam_mask[i] = 1 u_lam_mask[i] = 1 l_lam = MA.array(l_lam, mask=l_lam_mask, typecode=MA.Float64) u_lam = MA.array(u_lam, mask=u_lam_mask, typecode=MA.Float64) l_lam.shape = total_freq_shape u_lam.shape = total_freq_shape dsr_ll = dsr + (((se_rate / total_freq)**0.5) * (l_lam - total_freq)) dsr_ul = dsr + (((se_rate / total_freq)**0.5) * (u_lam - total_freq)) elif ci_method == 'ff': # Fay and Feuer method # see: Fay MP, Feuer EJ. Confidence intervals for directly standardized rates: # a method based on the gamma distribution. Statistics in Medicine 1997 Apr 15;16(7):791-801. ffvari = summfreq * ffwi**2.0 ffvar = sumaxis(ffvari, axis) dsr_flat = Numeric.ravel(MA.filled(dsr, 0)) dsr_shape = dsr.shape ffvar_flat = Numeric.ravel(MA.filled(ffvar, 0)) # flat arrays to hold results and associated masks dsr_ll = Numeric.empty(len(dsr_flat), typecode=Numeric.Float64) dsr_ul = Numeric.empty(len(dsr_flat), typecode=Numeric.Float64) dsr_ll_mask = Numeric.zeros(len(dsr_flat), typecode=Numeric.Int8) dsr_ul_mask = Numeric.zeros(len(dsr_flat), typecode=Numeric.Int8) for i, y in enumerate(dsr_flat): try: dsr_ll[i] = (ffvar_flat[i] / (2.0 * y)) * r.qchisq( alpha / 2., df=(2.0 * (y**2.) / ffvar_flat[i])) dsr_ul[i] = ((ffvar_flat[i] + (ffwm**2.0)) / (2.0 * (y + ffwm))) * r.qchisq( 1. - alpha / 2., df=((2.0 * ((y + ffwm)**2.0)) / (ffvar_flat[i] + ffwm**2.0))) except: dsr_ll[i] = 0.0 dsr_ul[i] = 0.0 dsr_ll_mask[i] = 1 dsr_ul_mask[i] = 1 dsr_ll = MA.array(dsr_ll, mask=dsr_ll_mask, typecode=MA.Float64) dsr_ul = MA.array(dsr_ul, mask=dsr_ul_mask, typecode=MA.Float64) dsr_ll.shape = dsr_shape dsr_ul.shape = dsr_shape result.add_table('dsr_ll' + n_add, data=dsr_ll * basepop, label='DSR ' + '%d' % (100.0 * conflev) + '% lower confidence limit' + l_add) result.add_table('dsr_ul' + n_add, data=dsr_ul * basepop, label='DSR ' + '%d' % (100.0 * conflev) + '% upper confidence limit' + l_add) finally: set_default_mode(r_mode) soom.info('calc_directly_std_rates took %.03f' % (time.time() - st)) if stdpopset is not None: name = 'dir_std_rates_' + summset.name label = 'Directly Standardised Rates for ' + (summset.label or summset.name) else: name = 'crude_rates_' + summset.name label = 'Crude Rates for ' + (summset.label or summset.name) if conflev: label += ' (%g%% conf. limits)' % (conflev * 100) if debug: global vars vars = Vars(locals()) return result.to_summset(name, label=label)
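# Scalar restatement of the Dobson et al. limits that the array code above computes:
# given the directly standardised rate, its variance and the total event count over
# the standard strata, the limits are dsr + sqrt(var/D) * (gamma quantile - D). This
# sketch assumes total_events > 0 and uses the same r.qgamma call as above; the
# function name is illustrative.
import math
from rpy import r

def dobson_limits(dsr, var_dsr, total_events, conflev=0.95):
    """Lower and upper Dobson confidence limits for a single cell (sketch)."""
    l_lam = r.qgamma((1.0 - conflev) / 2.0, total_events, scale=1.0)
    u_lam = r.qgamma((1.0 + conflev) / 2.0, total_events + 1.0, scale=1.0)
    half = math.sqrt(var_dsr / total_events)
    return dsr + half * (l_lam - total_events), dsr + half * (u_lam - total_events)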
def lm_fit(self, lm_instance, go_no2prediction_space, bit_string, curs=None, lm_table=None): """ 02-28-05 linear model fitting here 03-08-05 grouping and accumulating before do linear model fitting, see log of 2005, section 'linear model overfitting' for detail. 03-27-05 Use glm of R to do logistic regression 06-30-05 add cluster_size add bit_string to control which parameter should be enabled. 07-04-05 add connectivity_2nd 07-06-05 add logistic 11-09-05 extend coeff_list and coeff_p_value_list restructure the list, go_no2lm_results[go_no] --data_prepare --submit """ sys.stderr.write("Linear Model Fitting...\n") go_no2lm_results = {} #06-30-05 setup the formula_list based on bit_string coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'connectivity_2nd'] formula_list = [] for i in range(len(bit_string)): if bit_string[i] == '1': formula_list.append(coeff_name_list[i]) for (go_no,data) in go_no2prediction_space.iteritems(): sys.stderr.write("%s prediction entries from %s.\n"%(len(data), go_no)) #11-09-05 extend coeff_list and coeff_p_value_list coeff_list = [0]*7 #intercept, p_value, recurrence, connectivity, cluster_size coeff_p_value_list = [1]*7 index = 0 #06-30-05 the pointer for summary_stat if len(data)<=50: #two few data continue #convert it to a 2d array data = array(data) """ data_frame = r("d=data.frame(p_value=c(%s),recurrence=c(%s),connectivity=c(%s), is_correct=c(%s))"%(repr(list(data[:,0]))[1:-1], \ repr(list(data[:,1]))[1:-1], repr(list(data[:,2]))[1:-1], repr(list(data[:,3]))[1:-1])) lm_result = r("lm_result=glm(is_correct~p_value+recurrence+connectivity, data=d,family=binomial)") significance_dict = r("summary(lm_result)") print significance_dict['coefficients'] """ set_default_mode(NO_CONVERSION) #04-07-05 data_frame = r.as_data_frame({"p_value":data[:,0], "recurrence":data[:,1], "connectivity":data[:,2], \ "cluster_size":data[:,3], "connectivity_2nd":data[:,4], "is_correct":data[:,-1]}) #06-30-05 -1 denotes is_correct if self.logistic: lm_result = r.glm(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, family=r("binomial")) else: lm_result = r.glm(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame) #06-30-05 use formula_list set_default_mode(BASIC_CONVERSION) #04-07-05 #04-07-05 r.summary() requires lm_result in NO_CONVERSION state summary_stat = r.summary(lm_result) if self.debug: print "everything about coefficients from function", go_no, "is" print summary_stat['coefficients'] #p-values of coefficients """ #04-07-05 convert to python dictionary form lm_result = lm_result.as_py() coeff_list = [lm_result["coefficients"]["(Intercept)"], lm_result["coefficients"]["p_value"], \ lm_result["coefficients"]["recurrence"], lm_result["coefficients"]["connectivity"], \ lm_result["coefficients"]["cluster_size"], \ summary_stat['coefficients'][0][-1], summary_stat['coefficients'][1][-1],\ summary_stat['coefficients'][2][-1], summary_stat['coefficients'][3][-1],\ summary_stat['coefficients'][4][-1], 1] #the last entry is score_cut_off, replaced later in get_score_cut_off() #06-30-05 add corresponding p-values """ #06-30-05 0 in summary_stat['coefficients'] is intercept coeff_list[0] = summary_stat['coefficients'][0][0] #0 is the coefficient coeff_p_value_list[0] = summary_stat['coefficients'][0][-1] #-1 is the corresponding p-value #06-30-05 fill in other efficients based on bit_string, NOTE i+1 for i in range(len(bit_string)): if bit_string[i] == '1': index+=1 coeff_list[i+1] = summary_stat['coefficients'][index][0] #0 is the coefficient 
coeff_p_value_list[i+1] = summary_stat['coefficients'][index][-1] #-1 is the corresponding p-value #11-09-05 restructure the following list go_no2lm_results[go_no] = [coeff_list, coeff_p_value_list, 1] #the last entry is score_cut_off, replaced later in get_score_cut_off() sys.stderr.write("done.\n") return go_no2lm_results
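# Sketch of applying the stored coefficients to score one new prediction entry.
# The coeff_list layout follows the assignments above (intercept first, then the
# coeff_name_list order); coefficients disabled by bit_string simply stay 0. The
# logistic branch mirrors the self.logistic glm(..., family=binomial) fit; the
# function name is illustrative.
import math

def score_prediction(coeff_list, p_value, recurrence, connectivity,
                     cluster_size, connectivity_2nd, logistic=True):
    eta = (coeff_list[0] + coeff_list[1] * p_value + coeff_list[2] * recurrence +
           coeff_list[3] * connectivity + coeff_list[4] * cluster_size +
           coeff_list[5] * connectivity_2nd)
    if logistic:
        return 1.0 / (1.0 + math.exp(-eta))  # binomial glm coefficients are on the log-odds scale
    return eta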
def make_L(data, direction='S', z=None): """ Define the along-track distance from a reference point. direction sets the cardinal direction priority (N, S, W or E); 'S' means the southernmost point of the track is taken as the reference. z defines a bathymetry level; if given, the point closest to that bathymetry becomes the reference instead. If the track crosses that bathymetry more than once, the direction criterion is used to disambiguate. """ from fluid.common.distance import distance all_cycles_data = join_cycles(data) if z is None: import rpy #for t in topex.invert_keys(data): for t in all_cycles_data: rpy.set_default_mode(rpy.NO_CONVERSION) linear_model = rpy.r.lm(rpy.r("y ~ x"), data=rpy.r.data_frame( x=all_cycles_data[t]['Longitude'], y=all_cycles_data[t]['Latitude'])) rpy.set_default_mode(rpy.BASIC_CONVERSION) coef = rpy.r.coef(linear_model) if direction == 'S': lat0 = all_cycles_data[t]['Latitude'].min() - 1 lon0 = (lat0 - coef['(Intercept)']) / coef['x'] L_correction = distance(all_cycles_data[t]['Latitude'], all_cycles_data[t]['Longitude'], lat0, lon0).min() for c in invert_keys(data)[t]: data[c][t]['L'] = distance(data[c][t]['Latitude'], data[c][t]['Longitude'], lat0, lon0) - L_correction # This bathymetric method was only copied from an old code. This should be at least # changed, if not removed. else: # bathymetric reference, used when z is given import rpy for t in all_cycles_data: # First define the near coast values. idSouth = numpy.argmin(all_cycles_data[t]['Latitude']) L_tmp = distance(all_cycles_data[t]['Latitude'], all_cycles_data[t]['Longitude'], all_cycles_data[t]['Latitude'][idSouth], all_cycles_data[t]['Longitude'][idSouth]) idNearCoast = L_tmp.data < 400e3 if min(all_cycles_data[t]['Bathy'][idNearCoast]) > -z: idNearCoast = L_tmp.data < 600e3 # Then calculate the distance to a reference rpy.set_default_mode(rpy.NO_CONVERSION) linear_model = rpy.r.lm(rpy.r("y ~ x"), data=rpy.r.data_frame( x=all_cycles_data[t]['Longitude'], y=all_cycles_data[t]['Latitude'])) rpy.set_default_mode(rpy.BASIC_CONVERSION) coef = rpy.r.coef(linear_model) lat0 = all_cycles_data[t]['Latitude'].min() - 1 lon0 = (lat0 - coef['(Intercept)']) / coef['x'] #L = distance(,lon,lat0,lon0) # #id0 = numpy.argmin(numpy.absolute(all_cycles_data[t]['Bathy'][idNearCoast])) idref = numpy.argmin( numpy.absolute(all_cycles_data[t]['Bathy'][idNearCoast] + z)) #L_correction = distance(all_cycles_data[t]['Latitude'][idNearCoast][idref],all_cycles_data[t]['Longitude'][idNearCoast][idref],all_cycles_data[t]['Latitude'][idNearCoast][idref],all_cycles_data[t]['Longitude'][idNearCoast][idref]) L_correction = distance( all_cycles_data[t]['Latitude'][idNearCoast][idref], all_cycles_data[t]['Longitude'][idNearCoast][idref], lat0, lon0) for c in topex.invert_keys(data)[t]: #data[c][t]['L'] = distance(data[c][t]['Latitude'],data[c][t]['Longitude'],all_cycles_data[t]['Latitude'][idNearCoast][id0],all_cycles_data[t]['Longitude'][idNearCoast][id0]) - L_correction data[c][t]['L'] = distance(data[c][t]['Latitude'], data[c][t]['Longitude'], lat0, lon0) - L_correction # return
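# Helper sketch for the reference point used above: fit Latitude ~ Longitude with
# rpy, step one degree south of the track's southern end and project that latitude
# back onto the fitted line. The function name and the one-degree offset are
# illustrative, mirroring the direction == 'S' branch of make_L().
import rpy

def track_reference_point(lon, lat, lat_offset=1.0):
    rpy.set_default_mode(rpy.NO_CONVERSION)
    model = rpy.r.lm(rpy.r("y ~ x"), data=rpy.r.data_frame(x=lon, y=lat))
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    coef = rpy.r.coef(model)
    lat0 = min(lat) - lat_offset
    lon0 = (lat0 - coef['(Intercept)']) / coef['x']
    return lat0, lon0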
is_correct_list = [] for row in reader: p_value, recurrence, connectivity, cluster_size, gradient, gene_no, go_no, is_correct = row data.append([float(p_value), float(recurrence), float(connectivity), float(cluster_size), float(gradient), int(gene_no), int(go_no), int(is_correct)]) del reader return data, is_correct_list known_fname = '/tmp/hs_fim_92m5x25bfsdfl10q0_7gf1.known' unknown_fname = '/tmp/hs_fim_92m5x25bfsdfl10q0_7gf1.unknown' known_data, known_is_correct_list = read_data(known_fname) unknown_data, unknown_is_correct_list = read_data(unknown_fname) from numarray import array from rpy import r, set_default_mode,NO_CONVERSION,BASIC_CONVERSION set_default_mode(NO_CONVERSION) #pack data into data_frame known_data = array(known_data) known_data_frame = r.as_data_frame({"p_value":known_data[:,0], "recurrence":known_data[:,1], "connectivity":known_data[:,2], \ "cluster_size":known_data[:,3], "gradient":known_data[:,4]}) unknown_data = array(unknown_data) unknown_data_frame = r.as_data_frame({"p_value":unknown_data[:,0], "recurrence":unknown_data[:,1], "connectivity":unknown_data[:,2], \ "cluster_size":unknown_data[:,3], "gradient":unknown_data[:,4]}) #start to call randomF.r to run randomForest r.library('randomForest') r.source('randomF.r') #rf_model still needs to be in pure R object rf_model = r.randomF(known_data_frame, known_data[:,-1]) set_default_mode(BASIC_CONVERSION) unknown_pred = r.predictRandomF(rf_model, unknown_data_frame)
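# randomF.r is an external R script that is not shown here. If it is unavailable,
# roughly the same flow can be written against the randomForest package directly;
# this is a sketch (ntree and type='prob' are illustrative choices) that reuses the
# data frames built above and turns the response into an R factor for classification.
set_default_mode(NO_CONVERSION)
rf_model_direct = r.randomForest(known_data_frame, r.factor(known_data[:, -1]), ntree=500)
set_default_mode(BASIC_CONVERSION)
unknown_pred_direct = r.predict(rf_model_direct, unknown_data_frame, type='prob')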
# if you have rpy installed, use it to test the results have_rpy = False try: print "\n" print "="*30 print "Validating OLS results in R" print "="*30 import rpy have_rpy = True except ImportError: print "\n" print "="*30 print "Validating OLS-class results in R" print "="*30 print "rpy is not installed" print "="*30 if have_rpy: y = data[:,0] x1 = data[:,1] x2 = data[:,2] x3 = data[:,3] x4 = data[:,4] rpy.set_default_mode(rpy.NO_CONVERSION) linear_model = rpy.r.lm(rpy.r("y ~ x1 + x2 + x3 + x4"), data = rpy.r.data_frame(x1=x1,x2=x2,x3=x3,x4=x4,y=y)) rpy.set_default_mode(rpy.BASIC_CONVERSION) print linear_model.as_py()['coefficients'] summary = rpy.r.summary(linear_model) print summary
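# An rpy-free cross-check of the same coefficients with NumPy's least squares,
# assuming 'data' has the layout used above (y in column 0, x1..x4 in columns 1-4).
# The intercept comes first here because of the explicit column of ones.
import numpy as np

X = np.column_stack((np.ones(len(data)), data[:, 1:5]))
beta = np.linalg.lstsq(X, data[:, 0])[0]
print "numpy.linalg.lstsq coefficients:", beta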
def krige_to_grid(grid_fname, obs_x, obs_y, obs_data, vgm_par): """Interpolate point data onto a grid using Kriging. Interpolate point data onto a regular rectangular grid of square cells using Kriging with a predefined semi-variogram. The observed data locations must be specified in the same projection and coordinate system as the grid, which is defined in an ArcGIS raster file. Parameters ---------- grid_fname : string Filename of an ArcGIS float grid raster defining the required grid to Krige onto. All cells are included regardless of their value. obs_x : array_like The x coordinates of the observation locations. obs_y : array_like The y coordinates of the observation locations. obs_data : array_like The data values at the observation locations. vgm : dict A dictionary describing the semi-variogram model. Required keys are: 'model' can be one of {'Lin', 'Exp', 'Sph', 'Gau'} 'nugget' must be a scalar 'range' must be a scalar 'sill' must be a scalar Returns ------- kriged_est : 2darray A 2D array containing the Kriged estimates at each point on the specified rectangular grid. Notes ----- This function requires that R, RPy and the R gstat library are correctly installed. """ grid, headers = arcfltgrid.read(grid_fname) cols = headers[0] rows = headers[1] x0 = headers[2] y0 = headers[3] cell_size = headers[4] # TO DO: adjust x0, y0 by 0.5*cell_size if llcorner.. # define the grid (pixel centre's) xt, yt = np.meshgrid( np.linspace(x0, x0 + (cols - 1) * cell_size, num=cols), np.linspace(y0 + (rows - 1) * cell_size, y0, num=rows)) xt = xt.flatten() yt = yt.flatten() # Krige using gstat via RPy r.library('gstat') rpy.set_default_mode(rpy.NO_CONVERSION) obs_frame = r.data_frame(x=obs_x, y=obs_y, data=obs_data) target_grid = r.data_frame(x=xt, y=yt) v = r.vgm(vgm_par['sill'], vgm_par['model'], vgm_par['range'], vgm_par['nugget']) result = r.krige(r('data ~ 1'), r('~ x + y'), obs_frame, target_grid, model=v) rpy.set_default_mode(rpy.BASIC_CONVERSION) result = result.as_py() kriged_est = np.array(result['var1.pred']) kriged_est = kriged_est.reshape(rows, cols) return kriged_est
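# Usage sketch for krige_to_grid(); the grid filename, observation coordinates and
# variogram parameters below are placeholders only, and the coordinates must be in
# the same projection as the ArcGIS grid.
import numpy as np

example_vgm = {'model': 'Exp', 'nugget': 0.1, 'sill': 1.0, 'range': 5000.0}
example_x = np.array([451200.0, 453800.0, 455100.0])
example_y = np.array([5411800.0, 5413500.0, 5410200.0])
example_obs = np.array([12.3, 15.1, 9.8])
est = krige_to_grid('dem_grid.flt', example_x, example_y, example_obs, example_vgm)
print est.shape  # (rows, cols) of the target grid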