def rpart_predict(self, fit_model, data):
    """Run class prediction on `data` with an already-fitted rpart model.

    11-23-05 split from rpart_fit_and_predict()
    """
    if self.debug:
        sys.stderr.write("Doing rpart_predict...\n")
    sample_matrix = array(data)
    # Keep the frame as an opaque R object while it is being assembled.
    set_default_mode(NO_CONVERSION)
    # Column layout of the sample matrix (last column is the label).
    column_index = {
        "p_value": 0,
        "recurrence": 1,
        "connectivity": 2,
        "cluster_size": 3,
        "gradient": 4,
        "is_correct": -1,
    }
    frame = r.as_data_frame(
        dict([(label, sample_matrix[:, col]) for label, col in column_index.items()])
    )
    set_default_mode(BASIC_CONVERSION)
    pred = r.predict(fit_model, frame, type=["class"])  # 11-17-05 type=c("class")
    del frame
    if self.debug:
        sys.stderr.write("Done rpart_predict.\n")
    return pred
def randomForest_predict(self, fit_model, data):
    """Predict class labels for `data` with a fitted randomForest model.

    03-17-06
    2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
    """
    if self.debug:
        sys.stderr.write("Predicting by randomForest...\n")
    sample_matrix = array(data)
    # Build the data frame in NO_CONVERSION mode so it stays an R object.
    set_default_mode(NO_CONVERSION)
    frame_columns = {
        "p_value": sample_matrix[:, 0],
        "recurrence": sample_matrix[:, 1],
        "connectivity": sample_matrix[:, 2],
        "cluster_size": sample_matrix[:, 3],
        "gradient": sample_matrix[:, 4],
        "avg_degree": sample_matrix[:, 5],
        "unknown_ratio": sample_matrix[:, 6],
        # The label column goes in as an R factor for classification.
        "is_correct": r.factor(sample_matrix[:, -1]),
    }
    frame = r.as_data_frame(frame_columns)
    set_default_mode(BASIC_CONVERSION)
    pred = r.predict(fit_model, frame)
    del frame
    if self.debug:
        sys.stderr.write("Done randomForest prediction.\n")
    return pred
def rpart_fit_and_predict(self, all_data, known_data, rpart_cp, loss_matrix, prior_prob, bit_string='11111'):
    """Fit an rpart tree on known_data, then predict both sets.

    11-09-05
        1st use known_data to get the fit model
        2nd use the fit model to do prediction on all_data, result is prob for each class
    11-09-05 add rpart_cp
    11-17-05 add loss_matrix, prior_prob
        return two pred
    """
    sys.stderr.write("rpart fitting and predicting...\n")
    r.library("rpart")
    feature_names = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'gradient']
    # Keep only the features whose bit is switched on.
    selected = [feature_names[pos] for pos in range(len(bit_string)) if bit_string[pos] == '1']
    # 11-17-05 transform into array
    all_data = array(all_data)
    known_data = array(known_data)

    def _make_frame(matrix):
        # Assemble the R data frame for one sample matrix (last col = label).
        return r.as_data_frame({"p_value": matrix[:, 0], "recurrence": matrix[:, 1],
                                "connectivity": matrix[:, 2], "cluster_size": matrix[:, 3],
                                "gradient": matrix[:, 4], "is_correct": matrix[:, -1]})

    set_default_mode(NO_CONVERSION)
    data_frame = _make_frame(known_data)
    formula = r("is_correct~%s" % '+'.join(selected))
    if prior_prob:
        prior_prob = [prior_prob, 1 - prior_prob]  # get the full list
        fit = r.rpart(formula, data=data_frame, method="class",
                      control=r.rpart_control(cp=rpart_cp),
                      parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix)))
    else:
        fit = r.rpart(formula, data=data_frame, method="class",
                      control=r.rpart_control(cp=rpart_cp),
                      parms=r.list(loss=r.matrix(loss_matrix)))
    set_default_mode(BASIC_CONVERSION)
    pred_training = r.predict(fit, data_frame, type=["class"])
    del data_frame
    set_default_mode(NO_CONVERSION)
    all_data_frame = _make_frame(all_data)
    set_default_mode(BASIC_CONVERSION)
    pred = r.predict(fit, all_data_frame, type=["class"])  # 11-17-05 type=c("class")
    del all_data_frame
    sys.stderr.write("Done rpart fitting and predicting.\n")
    return pred, pred_training
def randomForest_fit(self, known_data, parameter_list, bit_string="1111111"):
    """Fit a randomForest classification model on known_data.

    known_data: 2d sequence; columns 0-6 hold the features named in
        coeff_name_list and the last column is the class label.
    parameter_list: [mty] -- number of variables tried at each split
        (R's `mtry`); a value <= 0 lets R use its default.
    bit_string: the i-th feature is included when bit_string[i] == '1'.
    Returns the fitted model as an unconverted R object.
    NOTE(review): the function leaves rpy's default mode at
        NO_CONVERSION on return -- callers appear to rely on that.

    03-17-06
    2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
    """
    if self.debug:
        sys.stderr.write("Fitting randomForest...\n")
    mty = parameter_list[0]
    from rpy import r
    # better than r.library("randomForest", lib_loc=os.path.join(lib_path, "R")) (see plone doc)
    r._libPaths(os.path.join(lib_path, "R"))
    r.library("randomForest")
    coeff_name_list = [
        "p_value",
        "recurrence",
        "connectivity",
        "cluster_size",
        "gradient",
        "avg_degree",
        "unknown_ratio",
    ]  # 2006-10-30
    formula_list = []
    for i in range(len(bit_string)):
        if bit_string[i] == "1":
            formula_list.append(coeff_name_list[i])
    formula = r("is_correct~%s" % "+".join(formula_list))
    known_data = array(known_data)
    set_default_mode(NO_CONVERSION)
    data_frame = r.as_data_frame(
        {
            "p_value": known_data[:, 0],
            "recurrence": known_data[:, 1],
            "connectivity": known_data[:, 2],
            "cluster_size": known_data[:, 3],
            "gradient": known_data[:, 4],
            "avg_degree": known_data[:, 5],
            "unknown_ratio": known_data[:, 6],
            # 03-17-06, watch r.factor: classification needs a factor response
            "is_correct": r.factor(known_data[:, -1]),
        }
    )
    # 2006-10-30
    if mty > 0:
        # BUG FIX: randomForest's argument is `mtry`, not `mty`; the old
        # keyword did not exist and was silently swallowed by R's `...`,
        # so the requested value was never applied.
        fit = r.randomForest(formula, data=data_frame, mtry=mty)
    else:
        fit = r.randomForest(formula, data=data_frame)
    del data_frame
    if self.debug:
        sys.stderr.write("Done fitting randomForest.\n")
    return fit
def __call__(self, gen, semis, morpho_f, phyllo_f, sen_f, prev_f, alpha_f, plan_f, fname):
    """Generate the geometry/parameter file `fname` from the Excel inputs.

    Sources writeparsGL.R, reads the per-(genotype, sowing) sheets from
    the given Excel files, then writes the output sections via the R
    helpers (writenumf, writePO, writecarto, writedimkin, writeprev).
    Returns the path of the generated file under `path`/temp.
    NOTE(review): readExcell/write* are presumably defined by
    writeparsGL.R -- confirm against that script.
    """
    os.chdir(path)
    r.source('writeparsGL.R')
    # Keep R return values as opaque R objects by default (local_mode 0).
    r.readExcell.local_mode(0)
    r.as_data_frame.local_mode(0)
    bloc = "B1"  # B1 by default (change if there is much variability between blocks)
    # Organ dimensions (internode, sheath, leaf length/width) as f(gen, semis).
    morpho = r.readExcell(morpho_f, gen+'_'+semis)
    # Leaf angle table as f(gen, semis).
    anglesPrev = r.readExcell(prev_f, gen+'_'+semis)
    # nbvis and nblig development dynamics.
    dev = r.readExcell(phyllo_f, gen+'_'+semis)
    n_max = round(dev.as_py()['nbvis'][-1])
    # Senescence dynamics: read with conversion (local_mode 1) so the
    # result is a Python dict that can be extended below.
    r.readExcell.local_mode(1)
    sen = r.readExcell(sen_f, gen+'_'+semis)
    r.readExcell.local_mode(0)
    # Extend the senescence curve with a far-future point.
    sen['TT'].append(3000.)
    sen['nb_sen'].append(n_max+1)
    sen = r.as_data_frame(sen)
    # Leaf blade shapes.
    alpha = r.readExcell(alpha_f,gen)
    # Field plot layout.
    plan = r.readExcell(plan_f, gen+'_'+semis+'_'+bloc)
    #fname = "_05_geomF2_S1B1.h"
    txt1 = r.writenumf (fname,dev,morpho,anglesPrev)
    print txt1
    r.writePO (fname)
    r.writecarto(fname, plan, 0.8, 0.125)
    txt2 = r.writedimkin(fname, dev, sen, morpho)
    print txt2
    r.writeprev(fname, alpha, anglesPrev, dev)
    os.chdir(path_ini)
    return join(join(path, 'temp'),fname)
def __call__(self, gen, semis, morpho_f, phyllo_f, sen_f, prev_f, alpha_f, plan_f, fname): os.chdir(path) r.source('writeparsGL.R') r.readExcell.local_mode(0) r.as_data_frame.local_mode(0) bloc = "B1" #B1 par defaut (changer si bcp de variabilite entre blocs) # dimensions des organes (en, gaine, longueur largeur de feuilles) f(gen,semis) morpho = r.readExcell(morpho_f, gen + '_' + semis) # tableau angles des feuilles f(gen,semis) anglesPrev = r.readExcell(prev_f, gen + '_' + semis) # dynamique nbvis et nblig dev = r.readExcell(phyllo_f, gen + '_' + semis) n_max = round(dev.as_py()['nbvis'][-1]) # dynamique de senescence r.readExcell.local_mode(1) sen = r.readExcell(sen_f, gen + '_' + semis) r.readExcell.local_mode(0) sen['TT'].append(3000.) sen['nb_sen'].append(n_max + 1) sen = r.as_data_frame(sen) # formes de limbes alpha = r.readExcell(alpha_f, gen) #plan parcelle plan = r.readExcell(plan_f, gen + '_' + semis + '_' + bloc) #fname = "_05_geomF2_S1B1.h" txt1 = r.writenumf(fname, dev, morpho, anglesPrev) print txt1 r.writePO(fname) r.writecarto(fname, plan, 0.8, 0.125) txt2 = r.writedimkin(fname, dev, sen, morpho) print txt2 r.writeprev(fname, alpha, anglesPrev, dev) os.chdir(path_ini) return join(join(path, 'temp'), fname)
def rpart_fit(self, known_data, parameter_list, bit_string="11111"):
    """Fit an rpart classification tree on known_data and return the model.

    11-09-05
        1st use known_data to get the fit model
        2nd use the fit model to do prediction on all_data, result is prob for each class
    11-09-05 add rpart_cp
    11-17-05 add loss_matrix, prior_prob
        return two pred
    11-23-05 split fit and predict. rpart_fit_and_predict() is split into rpart_fit() and rpart_predict()
    11-27-05 r cleanup
    03-17-06 use parameter_list instead
    """
    if self.debug:
        sys.stderr.write("Doing rpart_fit...\n")
    # 03-17-06 unpack the tuning knobs
    rpart_cp, loss_matrix, prior_prob = parameter_list
    # 11-27-05 r cleanup
    from rpy import r
    r.library("rpart")
    feature_names = ["p_value", "recurrence", "connectivity", "cluster_size", "gradient"]
    # 11-17-05 pick the features whose bit is on, then transform into array
    chosen = [feature_names[pos] for pos in range(len(bit_string)) if bit_string[pos] == "1"]
    known_data = array(known_data)
    set_default_mode(NO_CONVERSION)
    data_frame = r.as_data_frame(
        {
            "p_value": known_data[:, 0],
            "recurrence": known_data[:, 1],
            "connectivity": known_data[:, 2],
            "cluster_size": known_data[:, 3],
            "gradient": known_data[:, 4],
            "is_correct": known_data[:, -1],
        }
    )
    formula = r("is_correct~%s" % "+".join(chosen))
    tree_control = r.rpart_control(cp=rpart_cp)
    if prior_prob:
        prior_prob = [prior_prob, 1 - prior_prob]  # get the full list
        fit = r.rpart(formula, data=data_frame, method="class", control=tree_control,
                      parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix)))
    else:
        fit = r.rpart(formula, data=data_frame, method="class", control=tree_control,
                      parms=r.list(loss=r.matrix(loss_matrix)))
    del data_frame
    if self.debug:
        sys.stderr.write("Done rpart_fit.\n")
    return fit
data.append([float(p_value), float(recurrence), float(connectivity), float(cluster_size), float(gradient), int(gene_no), int(go_no), int(is_correct)]) del reader return data, is_correct_list known_fname = '/tmp/hs_fim_92m5x25bfsdfl10q0_7gf1.known' unknown_fname = '/tmp/hs_fim_92m5x25bfsdfl10q0_7gf1.unknown' known_data, known_is_correct_list = read_data(known_fname) unknown_data, unknown_is_correct_list = read_data(unknown_fname) from numarray import array from rpy import r, set_default_mode,NO_CONVERSION,BASIC_CONVERSION set_default_mode(NO_CONVERSION) #pack data into data_frame known_data = array(known_data) known_data_frame = r.as_data_frame({"p_value":known_data[:,0], "recurrence":known_data[:,1], "connectivity":known_data[:,2], \ "cluster_size":known_data[:,3], "gradient":known_data[:,4]}) unknown_data = array(unknown_data) unknown_data_frame = r.as_data_frame({"p_value":unknown_data[:,0], "recurrence":unknown_data[:,1], "connectivity":unknown_data[:,2], \ "cluster_size":unknown_data[:,3], "gradient":unknown_data[:,4]}) #start to call randomF.r to run randomForest r.library('randomForest') r.source('randomF.r') #rf_model still needs to be in pure R object rf_model = r.randomF(known_data_frame, known_data[:,-1]) set_default_mode(BASIC_CONVERSION) unknown_pred = r.predictRandomF(rf_model, unknown_data_frame) rf_model= rf_model.as_py(BASIC_CONVERSION) print rf_model.keys() print rf_model['confusion']
def lm_fit(self, lm_instance, go_no2prediction_space, bit_string, curs=None, lm_table=None):
    """Fit one (logistic) linear model per GO term and collect coefficients.

    For each go_no in go_no2prediction_space, fits glm on the features
    enabled by bit_string and stores [coeff_list, coeff_p_value_list, 1]
    in the returned dict (the trailing 1 is a score_cut_off placeholder,
    replaced later in get_score_cut_off()). Groups with <=50 entries are
    skipped. `lm_instance`, `curs` and `lm_table` are unused here.

    02-28-05
        linear model fitting here
    03-08-05
        grouping and accumulating before do linear model fitting, see log of 2005,
        section 'linear model overfitting' for detail.
    03-27-05
        Use glm of R to do logistic regression
    06-30-05
        add cluster_size
        add bit_string to control which parameter should be enabled.
    07-04-05
        add connectivity_2nd
    07-06-05
        add logistic
    11-09-05 extend coeff_list and coeff_p_value_list
        restructure the list, go_no2lm_results[go_no]

    --data_prepare
    --submit
    """
    sys.stderr.write("Linear Model Fitting...\n")
    go_no2lm_results = {}
    #06-30-05 setup the formula_list based on bit_string
    coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'connectivity_2nd']
    formula_list = []
    for i in range(len(bit_string)):
        if bit_string[i] == '1':
            formula_list.append(coeff_name_list[i])
    for (go_no,data) in go_no2prediction_space.iteritems():
        sys.stderr.write("%s prediction entries from %s.\n"%(len(data), go_no))
        #11-09-05 extend coeff_list and coeff_p_value_list
        # Slot 0 is the intercept; slots 1..5 map to coeff_name_list[i] at i+1.
        coeff_list = [0]*7    #intercept, p_value, recurrence, connectivity, cluster_size
        coeff_p_value_list = [1]*7
        index = 0    #06-30-05 the pointer for summary_stat
        if len(data)<=50:    #two few data
            continue
        #convert it to a 2d array
        data = array(data)
        # Commented-out earlier approach kept for reference (string literal):
        """
        data_frame = r("d=data.frame(p_value=c(%s),recurrence=c(%s),connectivity=c(%s), is_correct=c(%s))"%(repr(list(data[:,0]))[1:-1], \
            repr(list(data[:,1]))[1:-1], repr(list(data[:,2]))[1:-1], repr(list(data[:,3]))[1:-1]))
        lm_result = r("lm_result=glm(is_correct~p_value+recurrence+connectivity, data=d,family=binomial)")
        significance_dict = r("summary(lm_result)")
        print significance_dict['coefficients']
        """
        set_default_mode(NO_CONVERSION)    #04-07-05
        data_frame = r.as_data_frame({"p_value":data[:,0], "recurrence":data[:,1], "connectivity":data[:,2], \
            "cluster_size":data[:,3], "connectivity_2nd":data[:,4], "is_correct":data[:,-1]})    #06-30-05    -1 denotes is_correct
        if self.logistic:
            lm_result = r.glm(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, family=r("binomial"))
        else:
            lm_result = r.glm(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame)    #06-30-05 use formula_list
        set_default_mode(BASIC_CONVERSION)    #04-07-05
        #04-07-05 r.summary() requires lm_result in NO_CONVERSION state
        summary_stat = r.summary(lm_result)
        if self.debug:
            print "everything about coefficients from function", go_no, "is"
            print summary_stat['coefficients']    #p-values of coefficients
        # Commented-out earlier approach kept for reference (string literal):
        """
        #04-07-05 convert to python dictionary form
        lm_result = lm_result.as_py()
        coeff_list = [lm_result["coefficients"]["(Intercept)"], lm_result["coefficients"]["p_value"], \
            lm_result["coefficients"]["recurrence"], lm_result["coefficients"]["connectivity"], \
            lm_result["coefficients"]["cluster_size"], \
            summary_stat['coefficients'][0][-1], summary_stat['coefficients'][1][-1],\
            summary_stat['coefficients'][2][-1], summary_stat['coefficients'][3][-1],\
            summary_stat['coefficients'][4][-1], 1]    #the last entry is score_cut_off, replaced later in get_score_cut_off()
        #06-30-05 add corresponding p-values
        """
        #06-30-05 0 in summary_stat['coefficients'] is intercept
        coeff_list[0] = summary_stat['coefficients'][0][0]    #0 is the coefficient
        coeff_p_value_list[0] = summary_stat['coefficients'][0][-1]    #-1 is the corresponding p-value
        #06-30-05 fill in other efficients based on bit_string, NOTE i+1
        for i in range(len(bit_string)):
            if bit_string[i] == '1':
                index+=1
                coeff_list[i+1] = summary_stat['coefficients'][index][0]    #0 is the coefficient
                coeff_p_value_list[i+1] = summary_stat['coefficients'][index][-1]    #-1 is the corresponding p-value
        #11-09-05 restructure the following list
        go_no2lm_results[go_no] = [coeff_list, coeff_p_value_list, 1]    #the last entry is score_cut_off, replaced later in get_score_cut_off()
    sys.stderr.write("done.\n")
    return go_no2lm_results