Example #1
    def rpart_predict(self, fit_model, data):
        """
		11-23-05
			split from rpart_fit_and_predict()
		"""
        if self.debug:
            sys.stderr.write("Doing rpart_predict...\n")
        data = array(data)
        set_default_mode(NO_CONVERSION)
        data_frame = r.as_data_frame(
            {
                "p_value": data[:, 0],
                "recurrence": data[:, 1],
                "connectivity": data[:, 2],
                "cluster_size": data[:, 3],
                "gradient": data[:, 4],
                "is_correct": data[:, -1],
            }
        )
        set_default_mode(BASIC_CONVERSION)
        pred = r.predict(fit_model, data_frame, type=["class"])  # 11-17-05 type=c("class")
        del data_frame
        if self.debug:
            sys.stderr.write("Done rpart_predict.\n")
        return pred
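
The sequence above is the standard idiom of the classic rpy package: build the data frame while conversion is disabled, so it stays a live R object that r.predict can consume, then switch to BASIC_CONVERSION so the prediction comes back as a plain Python structure. A minimal, self-contained sketch of that round trip with rpart (the column names and toy values are made up for illustration; it assumes rpy and R's rpart package are installed):

from rpy import r, set_default_mode, NO_CONVERSION, BASIC_CONVERSION

r.library("rpart")

# Toy training data: two predictors and a binary label (illustrative only).
x1 = [0.1, 0.9, 0.2, 0.8, 0.3, 0.7]
x2 = [1.0, 5.0, 2.0, 6.0, 1.5, 5.5]
y = [0, 1, 0, 1, 0, 1]

set_default_mode(NO_CONVERSION)          # keep results on the R side
train = r.as_data_frame({"x1": x1, "x2": x2, "is_correct": y})
fit = r.rpart(r("is_correct~x1+x2"), data=train, method="class")

set_default_mode(BASIC_CONVERSION)       # convert the next result back to Python
pred = r.predict(fit, train, type=["class"])
print pred
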
Example #2
    def randomForest_predict(self, fit_model, data):
        """
		03-17-06
		2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
		"""
        if self.debug:
            sys.stderr.write("Predicting by randomForest...\n")
        data = array(data)
        set_default_mode(NO_CONVERSION)
        data_frame = r.as_data_frame(
            {
                "p_value": data[:, 0],
                "recurrence": data[:, 1],
                "connectivity": data[:, 2],
                "cluster_size": data[:, 3],
                "gradient": data[:, 4],
                "avg_degree": data[:, 5],
                "unknown_ratio": data[:, 6],
                "is_correct": r.factor(data[:, -1]),
            }
        )
        set_default_mode(BASIC_CONVERSION)
        pred = r.predict(fit_model, data_frame)
        del data_frame
        if self.debug:
            sys.stderr.write("Done randomForest prediction.\n")
        return pred
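
The only substantive difference from rpart_predict is that the response column is wrapped in r.factor, so the randomForest model was fitted against a categorical target and r.predict returns class labels rather than a numeric average. A short illustration of that wrapping with toy values (not part of the original class):

from rpy import r, set_default_mode, NO_CONVERSION

set_default_mode(NO_CONVERSION)
# A factor response makes randomForest classify; a plain numeric column
# would silently turn the same fit into a regression forest.
frame = r.as_data_frame({"x": [0.1, 0.9, 0.2, 0.8],
                         "is_correct": r.factor([0, 1, 0, 1])})
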
Example #3
	def rpart_fit_and_predict(self, all_data, known_data, rpart_cp, loss_matrix, prior_prob, bit_string='11111'):
		"""
		11-09-05
			1st use known_data to get the fit model
			2nd use the fit model to do prediction on all_data, result is prob for each class
		11-09-05 add rpart_cp
		11-17-05
			add loss_matrix, prior_prob
			return two pred
		"""
		sys.stderr.write("rpart fitting and predicting...\n")
		r.library("rpart")
		coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'gradient']
		formula_list = []
		for i in range(len(bit_string)):
			if bit_string[i] == '1':
				formula_list.append(coeff_name_list[i])
		#11-17-05 transform into array
		all_data = array(all_data)
		known_data = array(known_data)
		
		set_default_mode(NO_CONVERSION)
		data_frame = r.as_data_frame({"p_value":known_data[:,0], "recurrence":known_data[:,1], "connectivity":known_data[:,2], \
			"cluster_size":known_data[:,3], "gradient":known_data[:,4], "is_correct":known_data[:,-1]})
		if prior_prob:
			prior_prob = [prior_prob, 1-prior_prob]	#get the full list
			fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp),\
				parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix) ) )
		else:
			fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp),\
				parms=r.list(loss=r.matrix(loss_matrix) ) )
		
		set_default_mode(BASIC_CONVERSION)
		pred_training = r.predict(fit, data_frame, type=["class"])
		del data_frame
		
		set_default_mode(NO_CONVERSION)
		all_data_frame = r.as_data_frame({"p_value":all_data[:,0], "recurrence":all_data[:,1], "connectivity":all_data[:,2], \
			"cluster_size":all_data[:,3], "gradient":all_data[:,4], "is_correct":all_data[:,-1]})
		set_default_mode(BASIC_CONVERSION)
		pred = r.predict(fit, all_data_frame, type=["class"])	#11-17-05 type=c("class")
		del all_data_frame
		sys.stderr.write("Done rpart fitting and predicting.\n")
		return pred, pred_training
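
The bit_string argument controls which predictors enter the model: the i-th name in coeff_name_list is kept when the i-th character is '1'. A standalone illustration of that selection step (build_formula is a hypothetical helper, not part of the original module; the names are copied from the method above):

def build_formula(bit_string, coeff_name_list):
    # Keep the i-th predictor when the i-th bit is '1'.
    terms = [name for bit, name in zip(bit_string, coeff_name_list) if bit == '1']
    return "is_correct~%s" % '+'.join(terms)

coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'gradient']
print build_formula('10110', coeff_name_list)   # is_correct~p_value+connectivity+cluster_size
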
Example #4
    def randomForest_fit(self, known_data, parameter_list, bit_string="1111111"):
        """
		03-17-06
		2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
		"""
        if self.debug:
            sys.stderr.write("Fitting randomForest...\n")
        mtry = parameter_list[0]  # number of variables tried per split; R's randomForest spells this argument 'mtry'

        from rpy import r

        r._libPaths(
            os.path.join(lib_path, "R")
        )  # better than r.library("randomForest", lib_loc=os.path.join(lib_path, "R")) (see plone doc)
        r.library("randomForest")

        coeff_name_list = [
            "p_value",
            "recurrence",
            "connectivity",
            "cluster_size",
            "gradient",
            "avg_degree",
            "unknown_ratio",
        ]  # 2006-10-30
        formula_list = []
        for i in range(len(bit_string)):
            if bit_string[i] == "1":
                formula_list.append(coeff_name_list[i])
        formula = r("is_correct~%s" % "+".join(formula_list))

        known_data = array(known_data)
        set_default_mode(NO_CONVERSION)
        data_frame = r.as_data_frame(
            {
                "p_value": known_data[:, 0],
                "recurrence": known_data[:, 1],
                "connectivity": known_data[:, 2],
                "cluster_size": known_data[:, 3],
                "gradient": known_data[:, 4],
                "avg_degree": known_data[:, 5],
                "unknown_ratio": known_data[:, 6],
                "is_correct": r.factor(known_data[:, -1]),
            }
        )  # 03-17-06, watch r.factor	#2006-10-30

        if mtry > 0:
            fit = r.randomForest(formula, data=data_frame, mtry=mtry)
        else:
            fit = r.randomForest(formula, data=data_frame)

        del data_frame
        if self.debug:
            sys.stderr.write("Done fitting randomForest.\n")
        return fit
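
Together with randomForest_predict above, the returned fit can be used directly for prediction. A minimal end-to-end sketch with toy data (the single column name x is illustrative; it assumes rpy and the randomForest R package are available, and uses the mtry spelling that R's randomForest expects):

from rpy import r, set_default_mode, NO_CONVERSION, BASIC_CONVERSION

r.library("randomForest")

x = [0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.15, 0.85]
y = [0, 1, 0, 1, 0, 1, 0, 1]

set_default_mode(NO_CONVERSION)
train = r.as_data_frame({"x": x, "is_correct": r.factor(y)})
fit = r.randomForest(r("is_correct~x"), data=train, mtry=1)

set_default_mode(BASIC_CONVERSION)
print r.predict(fit, train)    # predicted class labels for the training rows
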
Example #5
    def __call__(self, gen, semis, morpho_f, phyllo_f, sen_f, prev_f, alpha_f, plan_f, fname):
        os.chdir(path)
        r.source('writeparsGL.R')
        r.readExcell.local_mode(0)
        r.as_data_frame.local_mode(0)

        bloc = "B1" #B1 par defaut (changer si bcp de variabilite entre blocs)

        # organ dimensions (internode, sheath, leaf length and width), f(gen, semis)
        morpho = r.readExcell(morpho_f, gen+'_'+semis)

        # table of leaf angles, f(gen, semis)
        anglesPrev = r.readExcell(prev_f, gen+'_'+semis)

        # dynamics of nbvis and nblig
        dev = r.readExcell(phyllo_f, gen+'_'+semis)
        n_max = round(dev.as_py()['nbvis'][-1])

        # senescence dynamics
        r.readExcell.local_mode(1)
        sen = r.readExcell(sen_f, gen+'_'+semis)
        r.readExcell.local_mode(0)
        sen['TT'].append(3000.)
        sen['nb_sen'].append(n_max+1)
        sen = r.as_data_frame(sen)

        # leaf blade shapes
        alpha = r.readExcell(alpha_f,gen)

        #plot layout
        plan = r.readExcell(plan_f, gen+'_'+semis+'_'+bloc)


        #fname = "_05_geomF2_S1B1.h"

        txt1 = r.writenumf (fname,dev,morpho,anglesPrev)
        print txt1
        r.writePO (fname)
        r.writecarto(fname, plan, 0.8, 0.125)
        txt2 = r.writedimkin(fname, dev, sen, morpho)
        print txt2
        r.writeprev(fname, alpha, anglesPrev, dev)

        os.chdir(path_ini)

        return join(join(path, 'temp'),fname)
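
readExcell, writenumf, writePO, writecarto, writedimkin and writeprev are R functions defined in writeparsGL.R, not part of rpy itself. The local_mode calls set a per-function conversion mode that overrides the global default for that one R function: judging from how dev and sen are used afterwards, mode 0 leaves the result as a live R object (hence dev.as_py()), while mode 1 converts it to a Python structure that can be edited before being handed back to r.as_data_frame. A minimal sketch of the same per-function override, using the named constants and a hypothetical sourced function readSheet:

from rpy import r, NO_CONVERSION, BASIC_CONVERSION

r.source('my_helpers.R')                   # hypothetical script defining readSheet()

r.readSheet.local_mode(NO_CONVERSION)      # this one function now returns R objects
as_r_object = r.readSheet('sheet1')

r.readSheet.local_mode(BASIC_CONVERSION)   # and now plain Python structures
as_python = r.readSheet('sheet1')
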
Example #6
    def rpart_fit(self, known_data, parameter_list, bit_string="11111"):
        """
		11-09-05
			1st use known_data to get the fit model
			2nd use the fit model to do prediction on all_data, result is prob for each class
		11-09-05 add rpart_cp
		11-17-05
			add loss_matrix, prior_prob
			return two pred
		11-23-05
			split fit and predict. rpart_fit_and_predict() is split into rpart_fit() and rpart_predict()
		11-27-05
			r cleanup
		03-17-06
			use parameter_list instead
		"""
        if self.debug:
            sys.stderr.write("Doing rpart_fit...\n")
            # 03-17-06
        rpart_cp, loss_matrix, prior_prob = parameter_list

        # 11-27-05 r cleanup
        from rpy import r

        r.library("rpart")

        coeff_name_list = ["p_value", "recurrence", "connectivity", "cluster_size", "gradient"]
        formula_list = []
        for i in range(len(bit_string)):
            if bit_string[i] == "1":
                formula_list.append(coeff_name_list[i])
                # 11-17-05 transform into array
        known_data = array(known_data)

        set_default_mode(NO_CONVERSION)
        data_frame = r.as_data_frame(
            {
                "p_value": known_data[:, 0],
                "recurrence": known_data[:, 1],
                "connectivity": known_data[:, 2],
                "cluster_size": known_data[:, 3],
                "gradient": known_data[:, 4],
                "is_correct": known_data[:, -1],
            }
        )
        if prior_prob:
            prior_prob = [prior_prob, 1 - prior_prob]  # get the full list
            fit = r.rpart(
                r("is_correct~%s" % "+".join(formula_list)),
                data=data_frame,
                method="class",
                control=r.rpart_control(cp=rpart_cp),
                parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix)),
            )
        else:
            fit = r.rpart(
                r("is_correct~%s" % "+".join(formula_list)),
                data=data_frame,
                method="class",
                control=r.rpart_control(cp=rpart_cp),
                parms=r.list(loss=r.matrix(loss_matrix)),
            )
        del data_frame
        if self.debug:
            sys.stderr.write("Done rpart_fit.\n")
        return fit
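
The prior_prob scalar is expanded to a two-element list before being handed to rpart's parms argument; the loss matrix is built by the caller and its exact layout is not shown here, so the sketch below passes only the prior and the complexity parameter (toy single-predictor data, assuming rpy and R's rpart package):

from rpy import r, set_default_mode, NO_CONVERSION, BASIC_CONVERSION

r.library("rpart")

set_default_mode(NO_CONVERSION)
frame = r.as_data_frame({"x": [0.1, 0.9, 0.2, 0.8, 0.3, 0.7],
                         "is_correct": [0, 1, 0, 1, 0, 1]})

prior_prob = 0.3
prior = [prior_prob, 1 - prior_prob]     # same expansion as in rpart_fit above
fit = r.rpart(r("is_correct~x"), data=frame, method="class",
              control=r.rpart_control(cp=0.01),
              parms=r.list(prior=prior))

set_default_mode(BASIC_CONVERSION)
print r.predict(fit, frame, type=["class"])
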
Example #7
		data.append([float(p_value), float(recurrence), float(connectivity), float(cluster_size), float(gradient), int(gene_no), int(go_no), int(is_correct)])
	del reader
	return data, is_correct_list

known_fname = '/tmp/hs_fim_92m5x25bfsdfl10q0_7gf1.known'
unknown_fname = '/tmp/hs_fim_92m5x25bfsdfl10q0_7gf1.unknown'

known_data, known_is_correct_list = read_data(known_fname)
unknown_data, unknown_is_correct_list = read_data(unknown_fname)

from numarray import array
from rpy import r, set_default_mode, NO_CONVERSION, BASIC_CONVERSION
set_default_mode(NO_CONVERSION)
#pack data into data_frame
known_data = array(known_data)
known_data_frame = r.as_data_frame({"p_value":known_data[:,0], "recurrence":known_data[:,1], "connectivity":known_data[:,2], \
	"cluster_size":known_data[:,3], "gradient":known_data[:,4]})
unknown_data = array(unknown_data)
unknown_data_frame = r.as_data_frame({"p_value":unknown_data[:,0], "recurrence":unknown_data[:,1], "connectivity":unknown_data[:,2], \
	"cluster_size":unknown_data[:,3], "gradient":unknown_data[:,4]})
#start to call randomF.r to run randomForest
r.library('randomForest')
r.source('randomF.r')
#rf_model still needs to be in pure R object
rf_model = r.randomF(known_data_frame, known_data[:,-1])

set_default_mode(BASIC_CONVERSION)
unknown_pred = r.predictRandomF(rf_model, unknown_data_frame)

rf_model = rf_model.as_py(BASIC_CONVERSION)
print rf_model.keys()
print rf_model['confusion']
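
randomF() and predictRandomF() are defined in randomF.r, which is not shown; on the Python side the only requirement is that rf_model stays a pure R object until it is explicitly pulled over with as_py(BASIC_CONVERSION). Assuming the wrapper returns an ordinary randomForest fit, the 'confusion' entry is its out-of-bag confusion matrix; a small sketch of printing it row by row once converted (the row-iterable layout is an assumption about how the conversion renders an R matrix):

confusion = rf_model['confusion']    # out-of-bag counts plus the class.error column
for row in confusion:                # assumes the converted matrix iterates by row
    print row
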
Example #8
	def lm_fit(self, lm_instance, go_no2prediction_space, bit_string, curs=None, lm_table=None):
		"""
		02-28-05
			linear model fitting here
		
		03-08-05
			grouping and accumulating before doing linear model fitting; see the 2005 log,
			section 'linear model overfitting', for details.
		03-27-05
			Use glm of R to do logistic regression
		06-30-05
			add cluster_size
			add bit_string to control which parameter should be enabled.
		07-04-05
			add connectivity_2nd
		07-06-05
			add logistic
		11-09-05 extend coeff_list and coeff_p_value_list
			restructure the list, go_no2lm_results[go_no]
			
			--data_prepare
			--submit
		"""
		sys.stderr.write("Linear Model Fitting...\n")
		go_no2lm_results = {}
		
		#06-30-05	setup the formula_list based on bit_string
		coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'connectivity_2nd']
		formula_list = []
		for i in range(len(bit_string)):
			if bit_string[i] == '1':
				formula_list.append(coeff_name_list[i])
		
		for (go_no,data) in go_no2prediction_space.iteritems():
			sys.stderr.write("%s prediction entries from %s.\n"%(len(data), go_no))
			#11-09-05 extend coeff_list and coeff_p_value_list
			coeff_list = [0]*7	#intercept, p_value, recurrence, connectivity, cluster_size, connectivity_2nd; the 7th slot is unused here
			coeff_p_value_list = [1]*7
			index = 0	#06-30-05	the pointer for summary_stat
			
			if len(data)<=50:
				#too few data points
				continue
			#convert it to a 2d array
			data = array(data)
			"""
			data_frame = r("d=data.frame(p_value=c(%s),recurrence=c(%s),connectivity=c(%s), is_correct=c(%s))"%(repr(list(data[:,0]))[1:-1], \
				repr(list(data[:,1]))[1:-1], repr(list(data[:,2]))[1:-1], repr(list(data[:,3]))[1:-1]))
			lm_result = r("lm_result=glm(is_correct~p_value+recurrence+connectivity, data=d,family=binomial)")
			significance_dict = r("summary(lm_result)")
			print significance_dict['coefficients']
			"""
			set_default_mode(NO_CONVERSION) #04-07-05
			data_frame = r.as_data_frame({"p_value":data[:,0], "recurrence":data[:,1], "connectivity":data[:,2], \
				"cluster_size":data[:,3], "connectivity_2nd":data[:,4], "is_correct":data[:,-1]})	#06-30-05	-1 denotes is_correct
			if self.logistic:
				lm_result = r.glm(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, family=r("binomial"))
			else:
				lm_result = r.glm(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame)	#06-30-05 use formula_list
			set_default_mode(BASIC_CONVERSION) #04-07-05
			#04-07-05 r.summary() requires lm_result in NO_CONVERSION state
			summary_stat = r.summary(lm_result)
			if self.debug:
				print "everything about coefficients from function", go_no, "is"
				print summary_stat['coefficients']	#p-values of coefficients
			"""
			#04-07-05 convert to python dictionary form
			lm_result = lm_result.as_py()
			coeff_list = [lm_result["coefficients"]["(Intercept)"], lm_result["coefficients"]["p_value"], \
				lm_result["coefficients"]["recurrence"], lm_result["coefficients"]["connectivity"], \
				lm_result["coefficients"]["cluster_size"], \
				summary_stat['coefficients'][0][-1], summary_stat['coefficients'][1][-1],\
				summary_stat['coefficients'][2][-1], summary_stat['coefficients'][3][-1],\
				summary_stat['coefficients'][4][-1], 1]
				#the last entry is score_cut_off, replaced later in get_score_cut_off()
				#06-30-05	add corresponding p-values
			"""
			#06-30-05	0 in summary_stat['coefficients'] is intercept
			coeff_list[0] = summary_stat['coefficients'][0][0]	#0 is the coefficient
			coeff_p_value_list[0] = summary_stat['coefficients'][0][-1]	#-1 is the corresponding p-value
			#06-30-05	fill in the other coefficients based on bit_string, NOTE i+1
			for i in range(len(bit_string)):
				if bit_string[i] == '1':
					index+=1
					coeff_list[i+1] = summary_stat['coefficients'][index][0]	#0 is the coefficient
					coeff_p_value_list[i+1] = summary_stat['coefficients'][index][-1]	#-1 is the corresponding p-value
			#11-09-05 restructure the following list
			go_no2lm_results[go_no] = [coeff_list, coeff_p_value_list, 1]	#the last entry is score_cut_off, replaced later in get_score_cut_off()
		sys.stderr.write("done.\n")
		return go_no2lm_results
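
Downstream, the stored coeff_list can reproduce a fitted value for a new prediction row without going back to R: with the logistic branch, the score is the inverse logit of the intercept plus the enabled terms. A small sketch of that arithmetic (score_row is a hypothetical helper; the feature order follows coeff_name_list above, and the coefficient values are made up):

import math

def score_row(coeff_list, features, bit_string):
    # Linear predictor: intercept plus the coefficients enabled by bit_string.
    lp = coeff_list[0]
    for i in range(len(bit_string)):
        if bit_string[i] == '1':
            lp += coeff_list[i + 1] * features[i]
    return 1.0 / (1.0 + math.exp(-lp))   # inverse logit, matching glm(..., family=binomial)

# features: p_value, recurrence, connectivity, cluster_size, connectivity_2nd
print score_row([0.5, -2.0, 0.1, 0.0, 0.0, 0.0, 0.0], [0.01, 3.0, 0.4, 12.0, 0.2], '11111')
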