def do_one_cv_classify_predeffolds_valid(theinput):
	"""Run one cross-validation pass with predefined folds and a separate validation set.

	theinput is an indexable sequence holding, in order: c, gamma, nf,
	output (training labels), input (training samples), output_valid,
	input_valid, useprob, fold_start, fold_start_valid and perfmetric.

	Returns the tuple (topacc, topphi, minfpfnratio, topf1, auc, optbias)
	unpacked from the result of optimize_results.
	"""
	c = theinput[0]
	gamma = theinput[1]
	nf = theinput[2]
	train_labels = theinput[3]
	train_samples = theinput[4]
	valid_labels = theinput[5]
	valid_samples = theinput[6]
	useprob = theinput[7]
	fold_start = theinput[8]
	fold_start_valid = theinput[9]
	perfmetric = theinput[10]

	param = svm.svm_parameter('-c %g -g %g -b %d' % (c, gamma, int(useprob)))

	# The fold boundaries are copied into C int arrays for the native call.
	prob = svm.svm_problem(train_labels, train_samples)
	fold_start_p = (c_int * len(fold_start))(*fold_start)

	prob_valid = svm.svm_problem(valid_labels, valid_samples)
	fold_start_p_valid = (c_int * len(fold_start_valid))(*fold_start_valid)

	n_valid = prob_valid.l
	# Buffer handed to the native routine (presumably one value per
	# validation sample is written into it -- TODO confirm).
	target = (c_double * n_valid)()
	posclass = train_labels[0]

	libsvm.svm_cross_validation_sepsets(prob, prob_valid, fold_start_p, fold_start_p_valid, param, nf, target)

	# NOTE(review): the reference labels are read from the training problem
	# (prob.y), not from prob_valid.y -- confirm this is intended.
	ys = prob.y[:n_valid]
	db = array([[ys[k], target[k]] for k in range(n_valid)])
	del target
	del fold_start_p
	del fold_start_p_valid

	neg = sum(1 for label in ys if label != posclass)
	pos = n_valid - neg

	[topacc, topphi, minfpfnratio, topf1, auc, optbias] = optimize_results(db, neg, pos, posclass, perfmetric)
	return topacc, topphi, minfpfnratio, topf1, auc, optbias
Exemplo n.º 2
0
    def train(self,labels,data):
        '''
        Fit the epsilon-SVR model stored in self._model.

        @param labels: A list of class labels.
        @param data: A 2D array or list of feature vectors.  One feature vector per row.
        '''
        # Lists and tuples are converted to double arrays; anything else is
        # used as passed in.
        if isinstance(data, (list, tuple)):
            data = np.array(data, dtype=np.double)
        labels = np.array(labels, dtype=np.double)

        # Two preprocessing stages, applied in this order.
        labels, data = self._preprocessor.train(labels, data)
        labels, data = self._label_scale.train(labels, data)

        # Describe the learner and the problem for LIBSVM.
        svr_settings = dict(svm_type=svm.EPSILON_SVR,
                            kernel_type=svm.RBF,
                            p=self._epsilon,
                            gamma=self._gamma)
        param = svm.svm_parameter(**svr_settings)
        prob = svm.svm_problem(labels.tolist(), data.tolist())

        # Constructing the model performs the training.
        self._model = svm.svm_model(prob, param)
def do_one_cv_classify_predeffolds_multi(theinput):
	"""Run one cross-validation pass with predefined folds and return the accuracy.

	theinput is an indexable sequence holding, in order: c, gamma, nf,
	output (labels), input (samples), useprob and fold_start.

	Returns the fraction of samples whose entry in the target buffer equals
	the corresponding label in output.
	"""
	c = theinput[0]
	gamma = theinput[1]
	nf = theinput[2]
	labels = theinput[3]
	samples = theinput[4]
	useprob = theinput[5]
	fold_start = theinput[6]

	param = svm.svm_parameter('-c %g -g %g -b %d' % (c, gamma, int(useprob)))

	prob = svm.svm_problem(labels, samples)
	target = (c_double * prob.l)()
	# Not used below; kept so that an empty label sequence still fails here.
	posclass = labels[0]
	# Fold boundaries as a C int array for the native call.
	fold_start_p = (c_int * len(fold_start))(*fold_start)
	libsvm.svm_cross_validation_labeltargets(prob, fold_start_p, param, nf, target)

	hits = sum(1 for k in range(len(labels)) if labels[k] == target[k])
	acc = hits * 1.0 / prob.l
	del target
	del fold_start_p
	return acc
def build_problem(img_kind, subdir = "data/"):
	"""Build a two-class svm_problem from the JPEG files found in subdir.

	Files matching ``<subdir>f_<img_kind>*.jpg`` are labelled 1; every other
	file matching ``<subdir>f_*_*.jpg`` is labelled -1.  Each image is loaded
	with cv.LoadImageM and turned into a feature vector by
	get_image_features(image, True, img_kind).

	img_kind -- image kind used both in the file pattern and for the features.
	subdir   -- directory prefix (including the trailing separator) to scan.
	            It was previously overwritten with "data/" and therefore
	            ignored; it is now honoured.

	Returns the svm.svm_problem built from the labels and feature vectors.
	"""
	the_ones = glob.glob(subdir + "f_" + img_kind + "*.jpg")
	# Set lookup instead of list.count() keeps the filtering linear.
	positives = set(the_ones)
	the_others = [path for path in glob.glob(subdir + "f_*_*.jpg") if path not in positives]

	classes = [1] * len(the_ones) + [-1] * len(the_others)
	data = [get_image_features(cv.LoadImageM(path), True, img_kind)
			for path in the_ones + the_others]

	return svm.svm_problem(classes, data)
Exemplo n.º 5
0
    def train(self, c, g, probability=True, compensation=True,
              path=None, filename=None, save=True):
        """Train an RBF-kernel SVM on the normalized data and optionally save it.

        c, g         -- values passed to svm_parameter as C and gamma.
        probability  -- truthy to set the LIBSVM probability flag to 1.
        compensation -- if true, class weights from _calculateCompensation
                        are attached to the parameters.
        path         -- output directory; defaults to self.dctEnvPaths['data'].
        filename     -- model file name; defaults to the 'strArffFileName'
                        option with its extension replaced by '.model'.
        save         -- if true, the model is written to path/filename.

        Returns the tuple (problem, model).
        """
        if filename is None:
            stem = os.path.splitext(self.getOption('strArffFileName'))[0]
            filename = stem + '.model'
        if path is None:
            path = self.dctEnvPaths['data']

        param = svm.svm_parameter(kernel_type=svm.RBF, C=c, gamma=g,
                                  probability=int(bool(probability)))

        labels, samples = self.getData(normalize=True)

        # because we train the SVM with dict we need to redefine the zero-insert
        self.hasZeroInsert = False
        if self.oClassifier is not None:
            self.oClassifier.setOption('hasZeroInsert', True)

        if compensation:
            weight, weight_label = self._calculateCompensation(labels)
            param.weight = weight
            param.weight_label = weight_label
            param.nr_weight = len(weight)

        problem = svm.svm_problem(labels, samples)
        model = svm.svm_model(problem, param)
        if save:
            target_file = os.path.join(path, filename)
            model.save(target_file)
        return problem, model
Exemplo n.º 6
0
  def generate_model(self, variant_name, models_folder):
    """Grid-search C and gamma, train an SVM and save the model.

    The training data is read from ``<variant_name>.t`` (or the ``.scale``
    variant when self.feature_scaling is set, after calling
    self.scale_features).  ``grid.py`` is run from ``$LIBSVM_PATH/tools/``
    and the second-to-last line of its output is parsed as "C gamma rate".
    The trained model is written to ``models_folder + variant_name + ".model"``.

    Raises KeyError if the LIBSVM_PATH environment variable is not set.
    """
    training_file = variant_name + ".t"
    if self.feature_scaling:
      self.scale_features(variant_name, models_folder)
      training_file += ".scale"
    (y, x) = svm_read_problem(training_file)
    self.m_prob = svm.svm_problem(y, x, self.m_params.kernel_type == PRECOMPUTED)

    libsvm_path = os.environ['LIBSVM_PATH']
    # Resolve the data file before changing directory so the path stays valid.
    scaled_filename = os.path.abspath(training_file)
    curdir = os.getcwd()
    os.chdir(libsvm_path + "/tools/")
    try:
      result = call_process("python grid.py " + scaled_filename)
    finally:
      # Always restore the working directory, even if the grid search fails.
      os.chdir(curdir)
    C, g, _rate = [float(l) for l in result.split("\n")[-2].split(" ")]

    print("C: %.8f, gamma: %.8f\n" % (C, g))

    self.m_params.C = C
    self.m_params.gamma = g

    print("\n-----------------------------")
    model = svm.svm_train(self.m_prob, self.m_params)
    print("-----------------------------\n")

    svm_save_model(models_folder + variant_name + ".model", model)
def do_one_cv_classify(theinput):
	"""Run one cross-validation pass for a classifier and score it.

	theinput is an indexable sequence holding, in order: c, gamma, nf,
	output (labels), input (samples), useprob and perfmetric.  The first
	label in output is treated as the positive class.

	Returns the tuple (topacc, topphi, minfpfnratio, topf1, auc, optbias)
	unpacked from the result of optimize_results.
	"""
	c = theinput[0]
	gamma = theinput[1]
	nf = theinput[2]
	labels = theinput[3]
	samples = theinput[4]
	useprob = theinput[5]
	perfmetric = theinput[6]

	param = svm.svm_parameter('-c %g -g %g -b %d' % (c, gamma, int(useprob)))

	prob = svm.svm_problem(labels, samples)
	n_samples = prob.l
	target = (c_double * n_samples)()

	posclass = labels[0]
	# Single-element fold array holding -1 (presumably "no predefined
	# folds" for the native routine -- TODO confirm).
	fold_start = (c_int * 1)(-1)
	libsvm.svm_cross_validation(prob, fold_start, param, nf, target)

	ys = prob.y[:n_samples]
	db = array([[ys[k], target[k]] for k in range(n_samples)])
	del target

	neg = sum(1 for label in ys if label != posclass)
	pos = n_samples - neg

	# Fixed: the original passed the undefined name `posval` here, which
	# raised NameError; the positive class is `posclass`.
	[topacc, topphi, minfpfnratio, topf1, auc, optbias] = optimize_results(db, neg, pos, posclass, perfmetric)
	return topacc, topphi, minfpfnratio, topf1, auc, optbias
Exemplo n.º 8
0
 def __init__(self, data_dictionary, model_target, kernel=LINEAR, cv_segments=10, **args):
     """Create an SVM model object from a dictionary of columns.

     data_dictionary -- mapping of column name to values; it is deep-copied,
                        so the caller's dictionary is not modified here.
     model_target    -- key of the column that holds the response.
     kernel          -- LIBSVM kernel type passed as 'kernel_type'.
     cv_segments     -- stored as self.folds.
     threshold       -- optional keyword; defaults to 2.3711.
     """
     # Use the caller's threshold when given, otherwise the default.
     self.threshold = args.get('threshold', 2.3711)

     # Store some object data
     model_dict = deepcopy(data_dictionary)
     self.model_target = model_target
     self.folds = cv_segments

     # Label the exceedances in the training set.
     model_dict[model_target] = self.Assign_Labels(model_dict[model_target])

     # Split the labels from the covariates; the remaining columns become
     # the training set, one row per observation.
     self.training_labels = model_dict.pop(model_target)
     self.training_set = np.transpose(model_dict.values())
     self.headers = model_dict.keys()

     # Scale the covariates to [-1,1]
     self.Scale_Covariates()

     # Both model builds below read the current problem and parameters.
     def build_model():
         return svm.svm_model(self.svm_problem, svm.svm_parameter(**self.svm_params))

     # Generate an SVM model.
     self.svm_problem = svm.svm_problem(self.training_labels, self.training_set)
     self.svm_params = {'kernel_type' : kernel, 'weight_label' : [0,1], 'weight' : [10,1]}
     self.model = build_model()

     # Use cross-validation to tune the model.
     self.Select_Linear_Model(-5, 10)

     # Rebuild the model, calculating the probabilities of class membership
     self.svm_params['probability'] = 1
     self.model = build_model()
def svm(y,K,**param_kw):
    """
    Solve the SVM problem. Return ``(alpha, b)``

    `y`
      labels
    `K`
      precomputed kernel matrix

    Additional keyword arguments are passed on as svm parameters to
    the model.

    The wrapper is needed to precondition the precomputed matrix for
    use with libsvm, and to extract the model parameters and convert
    them into the canonical weight vector plus scalar offset. Normally
    libsvm hides these model parameters, preferring instead to provide
    a high-level model object that can be queried for results.

    """
    # Prepend a column of 1-based serial numbers to the kernel matrix.
    serials = arange(1, len(K) + 1).reshape((-1, 1))
    X = asarray(hstack((serials, K)), dtype=double)
    y = asarray(y, dtype=double)

    prob = svm_problem(y, X)
    param = svm_parameter(kernel_type=PRECOMPUTED, **param_kw)
    return get_alpha_b(svm_model(prob, param))
Exemplo n.º 10
0
def iqr_model_train(matrix_kernel_train, labels_train, idx2clipid,
                    svm_para = '-w1 50 -t 4 -b 1 -c 1'):
    """
    Light-weight SVM learning module for online IQR

    @param matrix_kernel_train: n-by-n square numpy array with kernel values
        between training data
    @param labels_train: row-wise labels of training data (1 or True indicates
        positive, 0 or False otherwise)
    @param idx2clipid: indexable object; idx2clipid[row_idx] gives the clipid
        for the 0-based row in the matrix
    @param svm_para: (optional) SVM learning parameter string

    @rtype: dictionary
    @return: dictionary with 'model' (the trained model) and 'clipids_SVs'
        (list of clipids for the support vectors)
    @raise Exception: if the trained model has no support vectors

    """
    log = logging.getLogger('iqr_model_train')

    # set training inputs: 1-based serial numbers are stacked on as the first
    # row and the result is transposed, so they end up as the first column
    serials = np.arange(1, len(matrix_kernel_train) + 1)
    matrix_kernel_train = np.vstack((serials, matrix_kernel_train)).T
    log.debug("Done matrix_kernel_train")

    problem = svm.svm_problem(labels_train.tolist(), matrix_kernel_train.tolist(), isKernel=True)
    log.debug("Done problem")
    svm_param = svm.svm_parameter(svm_para)
    log.debug("Done svm_param")

    # train model
    model = svmutil.svm_train(problem, svm_param)
    log.debug("Done train model")

    # release memory
    del problem
    del svm_param
    log.debug("Done release memory")

    # check learning failure
    if model.l == 0:
        raise Exception('svm model learning failure')
    log.debug("Done checking learning failure (no failure)")

    n_SVs = model.l
    idxs_train_SVs = svmtools.get_SV_idxs_nonlinear_svm(model)
    clipids_SVs = []
    for sv_pos in range(n_SVs):
        # training indices are 1-based; idx2clipid is indexed 0-based
        clipids_SVs.append(idx2clipid[idxs_train_SVs[sv_pos] - 1])
        # within SVM model, index needs to be 1-base
        model.SV[sv_pos][0].value = sv_pos + 1
    log.debug("Done collecting support vector IDs")

    return {'model': model, 'clipids_SVs': clipids_SVs}
Exemplo n.º 11
0
    def _test_evaluation(self, allow_slow):
        """
        Test that the converted model makes the same predictions as LIBSVM.

        An epsilon-SVR model is trained on 50 random two-feature samples for
        every combination of the option strings below, converted with
        libsvm.convert, and (on macOS 10.13 or later) evaluated against the
        LIBSVM predictions.

        allow_slow -- when false, only the first parameter combination is run.
        """
        from svm import svm_parameter, svm_problem
        from svmutil import svm_train, svm_predict

        # Generate some smallish (poly kernels take too long on anything else) random data
        x, y = [], []
        for _ in range(50):
            cur_x1, cur_x2 = random.gauss(2, 3), random.gauss(-1, 2)
            x.append([cur_x1, cur_x2])
            y.append(1 + 2 * cur_x1 + 3 * cur_x2)

        input_names = ["x1", "x2"]
        df = pd.DataFrame(x, columns=input_names)
        prob = svm_problem(y, x)

        # Parameters
        base_param = "-s 3"  # model type is epsilon SVR
        non_kernel_parameters = [
            "", "-c 1.5 -p 0.5 -h 1", "-c 0.5 -p 0.5 -h 0"
        ]
        kernel_parameters = [
            "",
            "-t 2 -g 1.2",  # rbf kernel
            "-t 0",  # linear kernel
            "-t 1",
            "-t 1 -d 2",
            "-t 1 -g 0.75",
            "-t 1 -d 0 -g 0.9 -r 2",  # poly kernel
            "-t 3",
            "-t 3 -g 1.3",
            "-t 3 -r 0.8",
            "-t 3 -r 0.8 -g 0.5",  # sigmoid kernel
        ]

        for param1 in non_kernel_parameters:
            for param2 in kernel_parameters:
                param_str = " ".join([base_param, param1, param2])
                print(param_str)
                param = svm_parameter(param_str)

                model = svm_train(prob, param)
                (df["prediction"], _, _) = svm_predict(y, x, model)

                spec = libsvm.convert(model,
                                      input_names=input_names,
                                      target_name="target")

                if _is_macos() and _macos_version() >= (10, 13):
                    metrics = evaluate_regressor(spec, df)
                    # assertAlmostEquals is a deprecated alias that was
                    # removed in Python 3.12; use the canonical name.
                    self.assertAlmostEqual(metrics["max_error"], 0)

                if not allow_slow:
                    break

            if not allow_slow:
                break
Exemplo n.º 12
0
 def train(self, session, doc):
     """Train an SVM model from doc, save it, and switch to predicting.

     doc.get_raw(session) must yield [[class,...], [{vector},...]].
     The model is saved to the path configured as 'modelPath'.
     """
     labels, vectors = doc.get_raw(session)
     self.model = svm.svm_model(svm.svm_problem(labels, vectors), self.param)
     model_file = str(self.get_path(session, 'modelPath'))
     self.model.save(model_file)
     self.predicting = 1
Exemplo n.º 13
0
 def train(self, session, doc):
     """Train an SVM model from doc, save it, and switch to predicting.

     doc.get_raw(session) must yield [[class,...], [{vector},...]].
     The model is saved to the path configured as 'modelPath'.
     """
     labels, vectors = doc.get_raw(session)
     self.model = svm.svm_model(svm.svm_problem(labels, vectors), self.param)
     model_file = str(self.get_path(session, 'modelPath'))
     self.model.save(model_file)
     self.predicting = 1
Exemplo n.º 14
0
def trainSVM(kernel, labels):
    """Train an SVM on a precomputed kernel matrix and return the model.

    kernel -- kernel matrix; must provide tolist() (e.g. a numpy array).
    labels -- labels; must provide tolist().
    """
    # LIBSVM's precomputed-kernel format needs an id number as the first
    # column of every row.
    row_ids = arange(1, len(kernel.tolist()) + 1)
    indexed_kernel = column_stack((row_ids, kernel))
    prob = svm_problem(labels.tolist(), indexed_kernel.tolist(), isKernel=True)
    param = svm_parameter('-t 4')
    return svm_train(prob, param)
Exemplo n.º 15
0
    def _test_evaluation(self, allow_slow):
        """
        Test that the converted model makes the same predictions as LIBSVM.

        An epsilon-SVR model is trained on 50 random two-feature samples for
        every combination of the option strings below, converted with
        libsvm.convert, and evaluated against the LIBSVM predictions.

        allow_slow -- when false, only the first parameter combination is run.
        """
        from svm import svm_parameter, svm_problem
        from svmutil import svm_train, svm_predict

        # Generate some smallish (poly kernels take too long on anything else) random data
        x, y = [], []
        for _ in range(50):
            cur_x1, cur_x2 = random.gauss(2, 3), random.gauss(-1, 2)
            x.append([cur_x1, cur_x2])
            y.append(1 + 2 * cur_x1 + 3 * cur_x2)

        input_names = ['x1', 'x2']
        df = pd.DataFrame(x, columns=input_names)
        prob = svm_problem(y, x)

        # Parameters
        base_param = '-s 3'  # model type is epsilon SVR
        non_kernel_parameters = [
            '', '-c 1.5 -p 0.5 -h 1', '-c 0.5 -p 0.5 -h 0'
        ]
        kernel_parameters = [
            '',
            '-t 2 -g 1.2',  # rbf kernel
            '-t 0',  # linear kernel
            '-t 1',
            '-t 1 -d 2',
            '-t 1 -g 0.75',
            '-t 1 -d 0 -g 0.9 -r 2',  # poly kernel
            '-t 3',
            '-t 3 -g 1.3',
            '-t 3 -r 0.8',
            '-t 3 -r 0.8 -g 0.5'  # sigmoid kernel
        ]

        for param1 in non_kernel_parameters:
            for param2 in kernel_parameters:
                param_str = ' '.join([base_param, param1, param2])
                print(param_str)
                param = svm_parameter(param_str)

                model = svm_train(prob, param)
                (df['prediction'], _, _) = svm_predict(y, x, model)

                spec = libsvm.convert(model,
                                      input_names=input_names,
                                      target_name='target')

                metrics = evaluate_regressor(spec, df)
                # assertAlmostEquals is a deprecated alias that was removed
                # in Python 3.12; use the canonical name.
                self.assertAlmostEqual(metrics['max_error'], 0)

                if not allow_slow:
                    break

            if not allow_slow:
                break
Exemplo n.º 16
0
 def train(self, dataset):
     """
     Train the SVM classifier on dataset.

     The parent class's train() runs first; the model is then built from
     self.results (labels) and self.observations (samples) with a linear
     kernel, C=10 and probability estimates enabled, and stored in
     self.model.
     """
     super(SvmLearner, self).train(dataset)
     problem = svm.svm_problem(self.results, self.observations)
     settings = svm.svm_parameter(kernel_type=svm.LINEAR, C=10, probability=1)
     self.model = svm.svm_model(problem, settings)
Exemplo n.º 17
0
    def train(self,trainset):
        """
        Trains the SVM.

        trainset must be iterable as (input, target) pairs and expose a
        metadata dictionary with 'targets' (and 'class_to_id' when
        label weights are used). The trained model is stored in self.svm.

        Raises ValueError if self.kernel is not a supported kernel name.
        """

        self.n_classes = len(trainset.metadata['targets'])

        # Map the kernel name onto the LIBSVM constant
        kernel_types = {
            'linear': libsvm.LINEAR,
            'polynomial': libsvm.POLY,
            'rbf': libsvm.RBF,
            'sigmoid': libsvm.SIGMOID,
        }
        if self.kernel not in kernel_types:
            raise ValueError("Invalid kernel: " + self.kernel +
                             ". Should be either 'linear', 'polynomial', 'rbf' or 'sigmoid'")

        # Per-class weights: default weight 1 for every class id, overridden
        # by the entries of self.label_weights (keyed by class).
        if self.label_weights is None:
            nr_weight = 0
            weight_label = []
            weight = []
        else:
            class_to_id = trainset.metadata['class_to_id']
            nr_weight = self.n_classes
            weight_label = range(self.n_classes)
            weight = [1] * self.n_classes
            for label, value in self.label_weights.iteritems():
                weight[class_to_id[label]] = value

        libsvm_params = libsvm.svm_parameter(
            svm_type=libsvm.C_SVC,
            kernel_type=kernel_types[self.kernel],
            degree=self.degree,
            gamma=self.gamma,
            coef0=self.coef0,
            C=self.C,
            probability=int(self.output_probabilities),
            cache_size=self.cache_size,
            eps=self.tolerance,
            shrinking=int(self.shrinking),
            nr_weight=nr_weight,
            weight_label=weight_label,
            weight=weight)

        # Put training set in the appropriate format:
        #  if is sparse (i.e. a pair), inputs are converted to dictionaries
        #  if not, inputs are assumed to be sequences and are kept intact
        libsvm_inputs = []
        libsvm_targets = []
        for sample, target in trainset:
            if type(sample) is tuple:
                sample = dict(zip(sample[1], sample[0]))
            libsvm_inputs.append(sample)
            # LIBSVM requires double-valued targets
            libsvm_targets.append(float(target))

        libsvm_problem = libsvm.svm_problem(libsvm_targets, libsvm_inputs)

        # Train SVM
        self.svm = libsvm.svm_model(libsvm_problem, libsvm_params)
Exemplo n.º 18
0
def leave_one_out(y, x, param, n='DUMMY'):
    """Leave-one-out evaluation: train on all samples but one, predict that one.

    y, x  -- labels and samples; both must support slicing and `+`.
    param -- passed to svmutil.svm_train as its second argument.
    n     -- unused.

    Returns one tuple per sample: the svm_predict result extended with the
    sample's true label and make_d.decode(sample, make_d.decode_dic).
    """
    results = []
    for i, (held_out_label, _held_out_sample) in enumerate(zip(y, x)):
        rest_y = y[:i] + y[i+1:]
        rest_x = x[:i] + x[i+1:]
        model = svmutil.svm_train(svm.svm_problem(rest_y, rest_x), param, '-q')
        prediction = svmutil.svm_predict(y[i:i+1], x[i:i+1], model, '-b 1')
        decoded = make_d.decode(x[i], make_d.decode_dic)
        results.append(prediction + (held_out_label, decoded))
    return results
Exemplo n.º 19
0
    def iterGridSearchSVM(self,
                          c_info=None,
                          g_info=None,
                          fold=5,
                          probability=False,
                          compensation=True):
        """Grid-search log2(C) and log2(gamma) by cross-validation (generator).

        c_info, g_info -- optional (begin, end, step) sequences in log2 units;
                          the defaults are (-5, 15, 2) and (-15, 3, 2).
        fold           -- passed to svm.cross_validation.
        probability    -- truthy to set the LIBSVM probability flag to 1.
        compensation   -- if true, class weights from _calculateCompensation
                          are attached to every parameter set.

        Yields (n, l2c, l2g, conf) for every grid point, where n is the grid
        size computed up front and conf is the confusion matrix.
        """
        def resolve(info, fallback):
            # Take the caller's range when it has at least three values,
            # order its bounds and use the absolute step.
            if info is not None and len(info) >= 3:
                begin, end, step = info[:3]
            else:
                begin, end, step = fallback
            if end < begin:
                begin, end = end, begin
            return begin, end, abs(step)

        c_begin, c_end, c_step = resolve(c_info, (-5, 15, 2))
        g_begin, g_end, g_step = resolve(g_info, (-15, 3, 2))

        labels, samples = self.getData(normalize=True)
        problem = svm.svm_problem(labels, samples)

        if compensation:
            weight, weight_label = self._calculateCompensation(labels)

        n = ((c_end - c_begin) / c_step + 1) * ((g_end - g_begin) / g_step + 1)

        l2c = c_begin
        while l2c <= c_end:
            l2g = g_begin
            while l2g <= g_end:
                param = svm.svm_parameter(kernel_type=svm.RBF,
                                          C=2.**l2c,
                                          gamma=2.**l2g,
                                          probability=int(bool(probability)))
                if compensation:
                    param.weight = weight
                    param.weight_label = weight_label
                    param.nr_weight = len(weight)

                predictions = map(int, svm.cross_validation(problem, param, fold))

                conf = ConfusionMatrix.from_lists(labels, predictions,
                                                  self.class_names.keys())
                yield n, l2c, l2g, conf

                l2g += g_step
            l2c += c_step
Exemplo n.º 20
0
def do_one_cv_classify_valid(theinput):
	"""Run one cross-validation pass with a separate validation set and score it.

	theinput is an indexable sequence holding, in order: c, gamma, nf,
	output (training labels), input (training samples), output_valid,
	input_valid, useprob and perfmetric.  The first training label is
	treated as the positive class.

	Returns the tuple (topacc, topphi, minfpfnratio, topf1, auc, optbias)
	unpacked from the result of optimize_results.
	"""
	c = theinput[0]
	gamma = theinput[1]
	nf = theinput[2]
	train_labels = theinput[3]
	train_samples = theinput[4]
	valid_labels = theinput[5]
	valid_samples = theinput[6]
	useprob = theinput[7]
	perfmetric = theinput[8]

	param = svm.svm_parameter('-c %g -g %g -b %d' % (c, gamma, int(useprob)))

	prob = svm.svm_problem(train_labels, train_samples)
	prob_valid = svm.svm_problem(valid_labels, valid_samples)

	n_valid = prob_valid.l
	target = (c_double * n_valid)()

	posclass = train_labels[0]
	# Single-element fold arrays holding -1 (presumably "no predefined
	# folds" for the native routine -- TODO confirm).
	fold_start = (c_int * 1)(-1)
	fold_start_valid = (c_int * 1)(-1)

	libsvm.svm_cross_validation_sepsets(prob, prob_valid, fold_start, fold_start_valid, param, nf, target)

	# NOTE(review): the reference labels are read from the training problem
	# (prob.y), not from prob_valid.y -- confirm this is intended.
	ys = prob.y[:n_valid]
	db = array([[ys[k], target[k]] for k in range(n_valid)])
	del target

	neg = sum(1 for label in ys if label != posclass)
	pos = n_valid - neg

	# Fixed: the original passed the undefined name `posval` here, which
	# raised NameError; the positive class is `posclass`.
	[topacc, topphi, minfpfnratio, topf1, auc, optbias] = optimize_results(db, neg, pos, posclass, perfmetric)
	return topacc, topphi, minfpfnratio, topf1, auc, optbias
Exemplo n.º 21
0
	def trainmodel(self,train,cv,test,modelsavepath):
		"""Train an RBF-kernel SVM on the file `train` and save the model.

		train         -- path of the training data, read with svm_read_problem.
		cv, test      -- not used by this method.
		modelsavepath -- path the trained model is written to.
		"""
		# read the training data
		labels, samples = svmutil.svm_read_problem(train)
		prob = svm.svm_problem(labels, samples)
		param = svm.svm_parameter('-t 2 -c 0.5 -g 0.125 -b 1')
		model = svmutil.svm_train(prob, param)
		# The training file is read a second time and used as the
		# prediction set; the prediction results are not kept.
		eval_labels, eval_samples = svmutil.svm_read_problem(train)
		svmutil.svm_predict(eval_labels, eval_samples, model, '-b 1')
		# save model
		svmutil.svm_save_model(modelsavepath, model)
Exemplo n.º 22
0
    def iterGridSearchSVM(self, c_info=None, g_info=None, fold=5,
                          probability=False, compensation=True):
        """Grid-search log2(C) and log2(gamma) by cross-validation (generator).

        c_info, g_info -- optional (begin, end, step) sequences in log2 units;
                          the defaults are (-5, 15, 2) and (-15, 3, 2).
        fold           -- passed to svm.cross_validation.
        probability    -- truthy to set the LIBSVM probability flag to 1.
        compensation   -- if true, class weights from _calculateCompensation
                          are attached to every parameter set.

        Yields (n, l2c, l2g, conf) for every grid point, where n is the grid
        size computed up front and conf is the confusion matrix.
        """
        def grid_axis(info, fallback):
            # Take the caller's range when it has at least three values,
            # order its bounds and use the absolute step.
            if info is not None and len(info) >= 3:
                begin, end, step = info[:3]
            else:
                begin, end, step = fallback
            if end < begin:
                begin, end = end, begin
            return begin, end, abs(step)

        c_begin, c_end, c_step = grid_axis(c_info, (-5, 15, 2))
        g_begin, g_end, g_step = grid_axis(g_info, (-15, 3, 2))

        labels, samples = self.getData(normalize=True)
        problem = svm.svm_problem(labels, samples)

        if compensation:
            weight, weight_label = self._calculateCompensation(labels)

        n = ((c_end - c_begin) / c_step + 1) * ((g_end - g_begin) / g_step + 1)

        l2c = c_begin
        while l2c <= c_end:
            l2g = g_begin
            while l2g <= g_end:
                param = svm.svm_parameter(kernel_type=svm.RBF,
                                          C=2.**l2c, gamma=2.**l2g,
                                          probability=int(bool(probability)))
                if compensation:
                    param.weight = weight
                    param.weight_label = weight_label
                    param.nr_weight = len(weight)

                predictions = map(int, svm.cross_validation(problem, param, fold))

                conf = ConfusionMatrix.from_lists(labels, predictions,
                                                  self.l2nl)
                yield n, l2c, l2g, conf

                l2g += g_step
            l2c += c_step
Exemplo n.º 23
0
 def learnModel(self, train_y, train_X):
     """Scale the training data to [-1, 1] and train an SVM on it.

     The fitted scaler is stored in self._scaler and the trained model in
     self._model. C and gamma come from self._param_c and self._param_g.
     """
     # scale train data
     scaler = preprocessing.MinMaxScaler(feature_range = (-1, 1))
     scaled_rows = scaler.fit_transform(train_X).tolist()

     # learn and keep the svm model
     problem = svm_problem(train_y, scaled_rows)
     options = ' '.join(['-c', str(self._param_c), '-g', str(self._param_g), '-q'])
     self._model = svm_train(problem, svm_parameter(options))
     self._scaler = scaler
Exemplo n.º 24
0
def leave_one_out(y, x, param, n="DUMMY"):
    """Leave-one-out evaluation: train on all samples but one, predict that one.

    y, x  -- labels and samples; both must support slicing and `+`.
    param -- passed to svmutil.svm_train as its second argument.
    n     -- unused.

    Returns one tuple per sample: the svm_predict result extended with the
    sample's true label and make_d.decode(sample, make_d.decode_dic).
    """
    results = []
    for i, (held_out_label, _held_out_sample) in enumerate(zip(y, x)):
        rest_y = y[:i] + y[i + 1 :]
        rest_x = x[:i] + x[i + 1 :]
        model = svmutil.svm_train(svm.svm_problem(rest_y, rest_x), param, "-q")
        prediction = svmutil.svm_predict(y[i : i + 1], x[i : i + 1], model, "-b 1")
        decoded = make_d.decode(x[i], make_d.decode_dic)
        results.append(prediction + (held_out_label, decoded))
    return results
Exemplo n.º 25
0
    def lib_svm(self, train_file, test_file, digit0, digit1):
        """Train a LIBSVM classifier on train_file and predict test_file.

        Features and labels are loaded with
        self.get_data(file, digit0, digit1).  gamma is 0.05 when
        self.kernel == 'gaussian' and 0.001275 otherwise.

        Returns the (p_labels, p_acc, p_vals) tuple from svm_predict.
        Previously the predictions were computed and discarded (the method
        returned None).
        """
        features, labels = self.get_data(train_file, digit0, digit1)
        training_data = svm_problem(labels, features)

        # NOTE(review): both settings select '-t 2' (RBF); only gamma
        # differs between the two kernels -- confirm this is intended.
        gamma = '0.05' if self.kernel == 'gaussian' else '0.001275'
        params = svm_parameter('-s 0 -t 2 -c 1 -g ' + gamma)

        model = svm_train(training_data, params)

        test_features, test_labels = self.get_data(test_file, digit0, digit1)
        p_labels, p_acc, p_vals = svm_predict(test_labels, test_features,
                                              model)
        return p_labels, p_acc, p_vals
    def train_test_svm(self):
        """Train on the running training split and record the test accuracy.

        The trained model is stored in self.svm_model; the accuracy on the
        running test split is appended to self.refinement_results and
        printed as a percentage.
        """
        logging.debug("TRAINING Samples: " + str(len(self._running_X_train)))
        logging.debug("TESTING Samples: " + str(len(self._running_X_test)))

        problem = svm.svm_problem(self._running_y_train, self._running_X_train)
        self.svm_model = svmutil.svm_train(problem, self.svm_param)

        # Only the predicted labels are needed for the accuracy score.
        predicted_labels, _, _ = svmutil.svm_predict(
            self._running_y_test, self._running_X_test, self.svm_model, "-b 1")

        res = sklearn.metrics.accuracy_score(self._running_y_test,
                                             predicted_labels)
        self.refinement_results.append(res)
        print("RESULT: " + str(res * 100))
def lib_svm(train_file, test_file, kernel):
    """Train a LIBSVM classifier on train_file and predict test_file.

    Both files are loaded with get_data_from_csv.  gamma is 0.05 when
    kernel == 'gaussian' and 0.001275 otherwise.

    Returns the (p_labels, p_acc, p_vals) tuple from svm_predict.
    """
    print("inside libsvm")
    features, labels = get_data_from_csv(train_file)
    print(features)

    training_data = svm_problem(labels, features)

    # Both option strings select '-t 2'; only gamma differs.
    if kernel == 'gaussian':
        option_string = '-s 0 -t 2 -c 1 -g 0.05'
    else:
        option_string = '-s 0 -t 2 -c 1 -g 0.001275'
    model = svm_train(training_data, svm_parameter(option_string))

    test_features, test_labels = get_data_from_csv(test_file)
    return svm_predict(test_labels, test_features, model)
Exemplo n.º 28
0
def test(word, documents):
    """Cross-validate a linear SVM that predicts whether documents contain word.

    word      -- key into the module-level reverse_map, which gives the
                 feature looked up in every document.
    documents -- iterable of dict-like feature vectors (they must support
                 copy() and item access).

    Documents with a truthy value for the feature are positives (label 1);
    at most five times as many of the others are sampled as negatives
    (label 0).  The feature itself is zeroed in the copies before training.
    The fraction of negatives is printed.

    Returns the 20-fold cross-validation accuracy as a float.
    """
    import random
    import svm
    key = reverse_map[word]
    docs = [d.copy() for d in documents if d[key]]
    nondocs = [d.copy() for d in documents if not d[key]]
    nondocs = random.sample(nondocs, min(5 * len(docs), len(nondocs)))
    print(float(len(nondocs)) / (len(docs) + len(nondocs)))
    cats = [1] * len(docs) + [0] * len(nondocs)
    obs = docs + nondocs
    # Hide the target feature so the classifier cannot simply read it.
    for d in obs:
        d[key] = 0.
    # list() so the pairs can be shuffled in place on Python 3 as well.
    zobs = list(zip(obs, cats))
    random.shuffle(zobs)
    obs, cats = zip(*zobs)
    params = svm.svm_parameter(C=1, kernel_type=svm.LINEAR)
    problem = svm.svm_problem(cats, obs)
    target = svm.cross_validation(problem, params, 20)
    # Fixed: the original iterated `for i in cats`, using the label values
    # (0/1) as indices, so only target[0] and target[1] were ever compared.
    hits = sum(1 for predicted, actual in zip(target, cats) if predicted == actual)
    return hits / float(len(cats))
Exemplo n.º 29
0
def test(word, documents):
    """Cross-validate a linear SVM that predicts whether documents contain word.

    word      -- key into the module-level reverse_map, which gives the
                 feature looked up in every document.
    documents -- iterable of dict-like feature vectors (they must support
                 copy() and item access).

    Documents with a truthy value for the feature are positives (label 1);
    at most five times as many of the others are sampled as negatives
    (label 0).  The feature itself is zeroed in the copies before training.
    The fraction of negatives is printed.

    Returns the 20-fold cross-validation accuracy as a float.
    """
    import random
    import svm
    key = reverse_map[word]
    docs = [d.copy() for d in documents if d[key]]
    nondocs = [d.copy() for d in documents if not d[key]]
    nondocs = random.sample(nondocs, min(5 * len(docs), len(nondocs)))
    print(float(len(nondocs)) / (len(docs) + len(nondocs)))
    cats = [1] * len(docs) + [0] * len(nondocs)
    obs = docs + nondocs
    # Hide the target feature so the classifier cannot simply read it.
    for d in obs:
        d[key] = 0.
    # list() so the pairs can be shuffled in place on Python 3 as well.
    zobs = list(zip(obs, cats))
    random.shuffle(zobs)
    obs, cats = zip(*zobs)
    params = svm.svm_parameter(C=1, kernel_type=svm.LINEAR)
    problem = svm.svm_problem(cats, obs)
    target = svm.cross_validation(problem, params, 20)
    # Fixed: the original iterated `for i in cats`, using the label values
    # (0/1) as indices, so only target[0] and target[1] were ever compared.
    hits = sum(1 for predicted, actual in zip(target, cats) if predicted == actual)
    return hits / float(len(cats))
Exemplo n.º 30
0
def do_one_cv(theinput):
	"""Run one nu-SVR (RBF kernel) cross-validation pass and evaluate it.

	theinput is an indexable sequence holding, in order: nu, c, gamma, nf,
	output (labels), input (samples) and bins.

	Returns the (MSE, SCC) pair produced by evaluations.
	"""
	nu = theinput[0]
	c = theinput[1]
	gamma = theinput[2]
	nf = theinput[3]
	labels = theinput[4]
	samples = theinput[5]
	bins = theinput[6]

	option_string = '-s %d -t %d -n %g -c %g -g %g' % (svm.NU_SVR, svm.RBF, nu, c, gamma)
	param = svm.svm_parameter(option_string)

	prob = svm.svm_problem(labels, samples)
	n_samples = prob.l
	target = (c_double * n_samples)()
	# Single-element fold array holding -1 (presumably "no predefined
	# folds" for the native routine -- TODO confirm).
	fold_start = (c_int * 1)(-1)

	libsvm.svm_cross_validation_labeltargets(prob, fold_start, param, nf, target)
	MSE, SCC = evaluations(prob.y[:n_samples], target[:n_samples], bins)
	del target
	return MSE, SCC
Exemplo n.º 31
0
def do_one_cv_classify_predeffolds(theinput):
	"""Run one cross-validation pass with predefined folds and score it.

	theinput is an indexable sequence holding, in order: c, gamma, nf,
	output (labels), input (samples), useprob, fold_start and perfmetric.
	The first label in output is treated as the positive class.

	Returns the tuple (topacc, topphi, minfpfnratio, topf1, auc, optbias)
	unpacked from the result of optimize_results.
	"""
	c = theinput[0]
	gamma = theinput[1]
	nf = theinput[2]
	labels = theinput[3]
	samples = theinput[4]
	useprob = theinput[5]
	fold_start = theinput[6]
	perfmetric = theinput[7]

	param = svm.svm_parameter('-c %g -g %g -b %d' % (c, gamma, int(useprob)))

	prob = svm.svm_problem(labels, samples)
	# Fold boundaries as a C int array for the native call.
	fold_start_p = (c_int * len(fold_start))(*fold_start)

	n_samples = prob.l
	target = (c_double * n_samples)()
	posclass = labels[0]

	libsvm.svm_cross_validation(prob, fold_start_p, param, nf, target)

	ys = prob.y[:n_samples]
	db = array([[ys[k], target[k]] for k in range(n_samples)])
	del target
	del fold_start_p

	neg = sum(1 for label in ys if label != posclass)
	pos = n_samples - neg

	[topacc, topphi, minfpfnratio, topf1, auc, optbias] = optimize_results(db, neg, pos, posclass, perfmetric)
	return topacc, topphi, minfpfnratio, topf1, auc, optbias
Exemplo n.º 32
0
    def train(self, search=False, **kwargs):
        """ Train the SVM on the dataset. For RBF kernels (the default), an optional meta-parameter search can be performed.

        :key search: optional name of grid search class to use for RBF kernels: 'GridSearch' or 'GridSearchDOE'
        :key log2g: base 2 log of the RBF width parameter
        :key log2C: base 2 log of the slack parameter
        :key searchlog: filename into which to dump the search log
        :key others: ...are passed through to the grid search and/or libsvm

        The resulting model is handed to ``self.svm.setModel``; nothing is returned.
        """
        
        self.setParams(**kwargs)
        # Labels come from the flattened 'target' field, samples from 'input'.
        problem = svm_problem(self.ds['target'].flatten(), self.ds['input'].tolist())
        if search:
            # this is a bit of a hack...
            # NOTE(review): eval() executes `search` as Python source, so its
            # value must never come from untrusted input. The evaluated
            # expression refers to the local names `problem` and `self`, so
            # those names must not be renamed.
            model = eval(search + "(problem, self.svmtarget, cmin=[0,-7],cmax=[25,1], cstep=[0.5,0.2],plotflag=self.plot,searchlog=self.searchlog,**self.params)")
        else:
            # No search: train directly with the current parameters.
            param = svm_parameter(**self.params)
            model = svm_model(problem, param)
            logging.info("Training completed with parameters:")
            logging.info(repr(param))

        self.svm.setModel(model)
Exemplo n.º 33
0
def do_one_cv_classify_multi(theinput):
	"""Run one libsvm cross-validation pass and return the resulting accuracy.

	theinput is an indexable whose first six items are, in order:
	c, gamma, nf (passed to libsvm as the fold count argument), output
	(labels), input (feature vectors) and useprob (converted to int for the
	'-b' option).

	Returns the fraction of samples whose entry in the target buffer equals
	the corresponding label in output.
	"""
	c, gamma, nf, output, features, useprob = theinput[:6]

	param = svm.svm_parameter('-c %g -g %g -b %d' % (c, gamma, int(useprob)))
	prob = svm.svm_problem(output, features)

	# Buffer handed to libsvm; presumably filled with one predicted label per
	# sample -- confirm against the svm_cross_validation_labeltargets binding.
	target = (c_double * prob.l)()

	# A single -1 entry: presumably "no predefined fold boundaries" -- TODO confirm.
	fold_start = (c_int * 1)()
	fold_start[0] = -1
	libsvm.svm_cross_validation_labeltargets(prob, fold_start, param, nf, target)

	hits = sum(1 for expected, predicted in zip(output, target) if expected == predicted)
	return hits * 1.0 / prob.l
Exemplo n.º 34
0
    def train(self, search=False, **kwargs):
        """ Train the SVM on the dataset. For RBF kernels (the default), an optional meta-parameter search can be performed.

        :key search: optional name of grid search class to use for RBF kernels: 'GridSearch' or 'GridSearchDOE'
        :key log2g: base 2 log of the RBF width parameter
        :key log2C: base 2 log of the slack parameter
        :key searchlog: filename into which to dump the search log
        :key others: ...are passed through to the grid search and/or libsvm

        :raises NameError: if ``search`` is given but no module-level object of
            that name exists.
        """

        self.setParams(**kwargs)
        problem = svm_problem(self.ds['target'].flatten(), self.ds['input'].tolist())
        if search:
            # Resolve the search class by name among this module's globals and
            # call it directly, instead of eval()-ing a source string built
            # from the caller-supplied value.
            searchclass = globals().get(search)
            if searchclass is None:
                # Same exception type the former eval() raised for an unknown name.
                raise NameError("name %r is not defined" % (search,))
            model = searchclass(problem, self.svmtarget,
                                cmin=[0, -7], cmax=[25, 1], cstep=[0.5, 0.2],
                                plotflag=self.plot, searchlog=self.searchlog,
                                **self.params)
        else:
            param = svm_parameter(**self.params)
            model = svm_model(problem, param)
            logging.info("Training completed with parameters:")
            logging.info(repr(param))

        self.svm.setModel(model)
Exemplo n.º 35
0
def bench_svm(X, Y, T):
    """
    bench with swig-generated wrappers that come with libsvm

    Builds a problem from ``Y``/``X``, trains a model with ``svm_type=0`` and
    ``kernel_type=0``, predicts every row of ``T`` and appends the elapsed
    time to the module-level ``svm_results`` list. ``X``, ``Y`` and ``T`` must
    provide ``tolist()``.
    """

    import svm

    # Convert everything to plain lists before the clock starts so that the
    # conversion cost is not part of the measurement.
    X1 = X.tolist()
    Y1 = Y.tolist()
    T1 = T.tolist()

    gc.collect()

    # start time
    tstart = datetime.now()
    problem = svm.svm_problem(Y1, X1)
    param = svm.svm_parameter(svm_type=0, kernel_type=0)
    model = svm.svm_model(problem, param)
    # Use the pre-converted test rows instead of calling T.tolist() again
    # inside the timed region.
    for sample in T1:
        model.predict(sample)
    delta = (datetime.now() - tstart)
    # stop time
    svm_results.append(delta.seconds + delta.microseconds/mu_second)
Exemplo n.º 36
0
 def train(self, examples, parameters=None):
     """Train a class-weighted libsvm model and store it in ``self.model``.

     The examples are filtered with ``filterTrainingSet`` and written to
     ``<tempDir>/train.dat``. Each class in ``self.classes`` gets the weight
     ``1 - count / total``.

     :param examples: training examples; item 1 of each example is used as
         the label and item 2 as the sample.
     :param parameters: optional mapping of extra ``svm_parameter`` keyword
         arguments. A lowercase ``"c"`` key is renamed to ``"C"``. The
         caller's mapping is not modified.
     """
     self.isBinary = self.isBinaryProblem(examples)
     examples = self.filterTrainingSet(examples)
     ExampleUtils.writeExamples(examples, self.tempDir+"/train.dat")
     #prepare parameters:
     # Work on a copy: the default None used to crash on has_key(), and the
     # rename below used to leak into the caller's dict.
     parameters = dict(parameters) if parameters is not None else {}
     if "c" in parameters:
         assert "C" not in parameters
         parameters["C"] = parameters.pop("c")
     totalExamples = float(sum(self.classes.values()))
     weight_label = sorted(self.classes)
     weight = [1.0 - self.classes[k] / totalExamples for k in weight_label]
     libSVMparam = svm.svm_parameter(nr_weight=len(self.classes), weight_label=weight_label, weight=weight, **parameters)
     labels = []
     samples = []
     for example in examples:
         labels.append(example[1])
         samples.append(example[2])
     problem = svm.svm_problem(labels, samples)
     self.model = svm.svm_model(problem, libSVMparam)
Exemplo n.º 37
0
def do_training(classifier_name, train_x, train_y, test_x, test_y):
    """Train the classifier selected by ``classifier_name``, persist it, and return it.

    'LIBSVM' is handled through the libsvm helpers: the model is trained with
    the options '-s 1 -t 1 -q -d 3', saved with ``svm_save_model`` and run once
    through ``svm_predict`` on the test data. Every other name is looked up in
    the table below (an unknown name raises ``KeyError``), called with the
    training data, scored with ``metrics.accuracy_score`` on the test data and
    dumped with ``jl.dump`` as a one-entry dict keyed by the classifier name.
    """
    model_save_file = './models/' + classifier_name + '.model'

    if classifier_name == 'LIBSVM':
        problem = svm_problem(np.array(train_y).tolist(),
                              np.array(train_x).tolist())
        # param = svm_parameter('-t 2 -q')
        options = svm_parameter('-s 1 -t 1 -q -d 3')
        libsvm_model = svm_train(problem, options)
        svm_save_model('./models/{}.model'.format(classifier_name), libsvm_model)
        expected = np.array(test_y).tolist()
        samples = np.array(test_x).tolist()
        svm_predict(expected, samples, libsvm_model)
        return libsvm_model

    # Name -> training function taking (train_x, train_y) and returning a fitted model.
    registry = {
        'NB': naive_bayes_classifier,
        'KNN': knn_classifier,
        'LR': logistic_regression_classifier,
        'RF': random_forest_classifier,
        'DT': decision_tree_classifier,
        'SVM': svm_classifier,
        'SVMCV': svm_cross_validation,
        'GBDT': gradient_boosting_classifier,
        'ADA': ada_boosting_classifier,
        'MLP': mlp_classifier,
        'XGBOOST': xgboost_classifier
    }
    build = registry[classifier_name]
    fitted = build(train_x, train_y)
    predicted = fitted.predict(test_x)
    score = metrics.accuracy_score(test_y, predicted)
    print('accuracy: %.2f%%' % (100 * score))
    jl.dump({classifier_name: fitted}, model_save_file)
    return fitted
Exemplo n.º 38
0
def train(request):
    """Train an SVM on every stored Point2d and try to save it to 'libsvm.model'.

    Labels come from ``p.label`` and inputs from ``[p.x, p.y]``. The model is
    trained with the libsvm options '-t 2 -c 100'. A failure while saving is
    only printed; the response is ``{"status": "trained"}`` either way.
    ``request`` is not used.
    """
    # Collect the label and the 2-D coordinates of each point in one pass.
    labels = []
    inputs = []
    for point in models.Point2d.objects.all():
        labels += [point.label]
        inputs += [[point.x, point.y]]

    model = svmutil.svm_train(svm.svm_problem(labels, inputs),
                              svm.svm_parameter('-t 2 -c 100'))

    try:
        svmutil.svm_save_model('libsvm.model', model)
    except Exception as e:
        print "error: ", e, "\n"

    return json({"status": "trained"})
Exemplo n.º 39
0
def train(request):
    """Fit an SVM to all Point2d records and attempt to write it to 'libsvm.model'.

    Each record contributes its ``label`` and the pair ``[x, y]``. Training
    uses the libsvm option string '-t 2 -c 100'. If saving raises, the error
    is printed and the function still returns ``json({"status": "trained"})``.
    The ``request`` argument is ignored.
    """
    rows = list(models.Point2d.objects.all())

    # Information presented to the SVM, in record order.
    labels = [row.label for row in rows]
    inputs = [[row.x, row.y] for row in rows]

    problem = svm.svm_problem(labels, inputs)
    options = svm.svm_parameter('-t 2 -c 100')
    model = svmutil.svm_train(problem, options)

    try:
        svmutil.svm_save_model('libsvm.model', model)
    except Exception as e:
        print "error: ", e, "\n"

    response = {"status": "trained"}
    return json(response)
Exemplo n.º 40
0
    def train(self,
              c,
              g,
              probability=True,
              compensation=True,
              path=None,
              filename=None,
              save=True):
        """Train an RBF-kernel SVM on the normalized data and optionally save it.

        :param c: value passed to libsvm as ``C``.
        :param g: value passed to libsvm as ``gamma``.
        :param probability: truthy to set libsvm's ``probability`` to 1, else 0.
        :param compensation: when truthy, class weights from
            ``_calculateCompensation(labels)`` are set on the parameter object.
        :param path: directory for the model file; defaults to ``self.data_dir``.
        :param filename: model file name; defaults to ``self.arff_file`` with
            its extension replaced by '.model'.
        :param save: when truthy, the model is saved to ``path``/``filename``.
        :return: tuple ``(problem, model)``.
        """
        if filename is None:
            filename = splitext(self.arff_file)[0] + '.model'
        if path is None:
            path = self.data_dir

        param = svm.svm_parameter(kernel_type=svm.RBF, C=c, gamma=g,
                                  probability=int(bool(probability)))

        labels, samples = self.getData(normalize=True)

        # because we train the SVM with dict we need to redefine the zero-insert
        self.has_zero_insert = False
        if self.classifier is not None:
            self.classifier.setOption('hasZeroInsert', True)

        if compensation:
            class_weights, class_labels = self._calculateCompensation(labels)
            param.weight = class_weights
            param.weight_label = class_labels
            param.nr_weight = len(class_weights)

        problem = svm.svm_problem(labels, samples)
        model = svm.svm_model(problem, param)
        if save:
            target = os.path.join(path, filename)
            model.save(target)
        return problem, model
Exemplo n.º 41
0
    def rank(self, pos, neg):
        """
        Rank the currently indexed elements given ``pos`` positive and ``neg``
        negative exemplar descriptor elements.

        An SVM is trained on the exemplars, and a probability for every
        element of ``self._descr_cache`` is computed from the model's support
        vectors, ``rho``, ``probA`` and ``probB``.

        :param pos: Iterable of positive exemplar DescriptorElement instances.
            This may be optional for some implementations.
        :type pos: collections.Iterable[smqtk.representation.DescriptorElement]

        :param neg: Iterable of negative exemplar DescriptorElement instances.
            This may be optional for some implementations. When empty,
            negatives are auto-selected from the indexed descriptors.
        :type neg: collections.Iterable[smqtk.representation.DescriptorElement]

        :raises ValueError: No positive examples, or no negative examples
            (given or auto-selected), are available for training.
        :raises RuntimeError: The trained SVM model reports ``l == 0``.

        :return: Map of indexed descriptor elements to a rank value between
            [0, 1] (inclusive) range, where a 1.0 means most relevant and 0.0
            meaning least relevant.
        :rtype: dict[smqtk.representation.DescriptorElement, float]

        """
        # Notes:
        # - Pos and neg exemplars may be in our index.

        #
        # SVM model training
        #
        # Copy pos descriptors into a set for repeated iteration
        #: :type: set[smqtk.representation.DescriptorElement]
        pos = set(pos)
        # Creating training matrix and labels
        train_labels = []
        train_vectors = []
        num_pos = 0
        for d in pos:
            train_labels.append(+1)
            train_vectors.append(d.vector().tolist())
            num_pos += 1
        self._log.debug("Positives given: %d", num_pos)

        # When no negative examples are given, naively pick most distant example
        # in our dataset, using HI metric, for each positive example
        neg_autoselect = set()
        if not neg:
            self._log.info(
                "Auto-selecting negative examples. (%d per positive)",
                self._autoneg_select_ratio)
            # ``train_vectors`` only composed of positive examples at this point
            for p in pos:
                # where d is the distance vector to descriptor elements in cache
                d = histogram_intersection_distance(p.vector(),
                                                    self._descr_matrix)
                # Scan vector for max distance index
                # - Allow variable number of maximally distant descriptors to
                #   be picked per positive.
                # - ``m_set`` maps distance value -> index, so two descriptors
                #   at exactly the same distance share one entry (the later
                #   index overwrites the earlier one).
                m_set = {}  # track most distant neighbors
                m_val = -float(
                    'inf')  # track smallest distance of most distant neighbors
                for i in xrange(d.size):
                    if d[i] > m_val:
                        m_set[d[i]] = i
                        # Over capacity: drop the entry for the previous
                        # smallest tracked distance, if it is still present.
                        if len(m_set) > self._autoneg_select_ratio:
                            if m_val in m_set:
                                del m_set[m_val]
                        m_val = min(m_set)
                for i in m_set.itervalues():
                    neg_autoselect.add(self._descr_cache[i])
            # Remove any positive examples from auto-selected results
            neg_autoselect.difference_update(pos)
            self._log.debug("Auto-selected negative descriptors [%d]: %s",
                            len(neg_autoselect), neg_autoselect)

        # Negatives (given first, then auto-selected) are appended after all
        # positives, so the first ``num_pos`` training rows are the positives.
        num_neg = 0
        for d in neg:
            train_labels.append(-1)
            train_vectors.append(d.vector().tolist())
            num_neg += 1
        for d in neg_autoselect:
            train_labels.append(-1)
            train_vectors.append(d.vector().tolist())
            num_neg += 1

        if not num_pos:
            raise ValueError("No positive examples provided.")
        elif not num_neg:
            raise ValueError("No negative examples provided.")

        # Training SVM model
        self._log.debug("online model training")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        svm_model = svmutil.svm_train(
            svm_problem, self._gen_svm_parameter_string(num_pos, num_neg))
        if svm_model.l == 0:
            raise RuntimeError("SVM Model learning failed")

        #
        # Platt Scaling for probability rankings
        #

        self._log.debug("making test distance matrix")
        # Number of support vectors
        # Q: is this always the same as ``svm_model.l``?
        num_SVs = sum(svm_model.nSV[:svm_model.nr_class])
        # Support vector dimensionality
        dim_SVs = len(train_vectors[0])
        # initialize matrix they're going into
        svm_SVs = numpy.ndarray((num_SVs, dim_SVs), dtype=float)
        # Copy the first ``dim_SVs`` node values of each support vector into
        # the dense matrix.
        for i, nlist in enumerate(svm_model.SV[:svm_SVs.shape[0]]):
            svm_SVs[i, :] = [n.value for n in nlist[:len(train_vectors[0])]]
        # compute matrix of distances from support vectors to index elements
        # TODO: Optimize this step by caching SV distance vectors
        #       - It is known that SVs are vectors from the training data, so
        #           if the same descriptors are given to this function
        #           repeatedly (which is the case for IQR), this can be faster
        #           because we're only computing at most a few more distance
        #           vectors against our indexed descriptor matrix, and the rest
        #           have already been computed before.
        #       - At worst, we're effectively doing this call because each SV
        #           needs to have its distance vector computed.
        svm_test_k = compute_distance_matrix(svm_SVs,
                                             self._descr_matrix,
                                             histogram_intersection_distance,
                                             row_wise=True)

        self._log.debug("Platt scalling")
        # the actual platt scaling stuff
        weights = numpy.array(svm_model.get_sv_coef()).flatten()
        margins = numpy.dot(weights, svm_test_k)
        rho = svm_model.rho[0]
        probA = svm_model.probA[0]
        probB = svm_model.probB[0]
        #: :type: numpy.core.multiarray.ndarray
        probs = 1.0 / (1.0 + numpy.exp((margins - rho) * probA + probB))

        # Detect whether we need to flip probabilities
        # - Probability of input positive examples should have a high
        #   probability score among the generated probabilities of our index.
        # - If the positive example probabilities show to be in the lower 50%,
        #   flip the generated probabilities, since it's experimentally known
        #   that the SVM will change which index it uses to represent a
        #   particular class label occasionally, which influences the Platt
        #   scaling apparently.
        pos_vectors = numpy.array(train_vectors[:num_pos])
        pos_test_k = compute_distance_matrix(svm_SVs,
                                             pos_vectors,
                                             histogram_intersection_distance,
                                             row_wise=True)
        pos_margins = numpy.dot(weights, pos_test_k)
        #: :type: numpy.core.multiarray.ndarray
        pos_probs = 1.0 / (1.0 +
                           numpy.exp((pos_margins - rho) * probA + probB))
        # Check if average positive probability is less than the average index
        # probability. If so, the platt scaling probably needs to be flipped.
        if (pos_probs.sum() / pos_probs.size) < (probs.sum() / probs.size):
            self._log.debug("inverting probabilities")
            probs = 1. - probs

        # Pair each cached descriptor with its probability, in cache order.
        rank_pool = dict(zip(self._descr_cache, probs))
        return rank_pool
Exemplo n.º 42
0
    def rank(
            self, pos: Iterable[DescriptorElement],
            neg: Iterable[DescriptorElement]
    ) -> Dict[DescriptorElement, float]:
        """
        Rank the currently indexed elements given ``pos`` positive and ``neg``
        negative exemplar descriptor elements.

        An SVM is trained on the exemplars, and a probability for every
        element of ``self._descr_cache`` is computed from the model's support
        vectors, ``rho``, ``probA`` and ``probB``.

        :param pos: Iterable of positive exemplar DescriptorElement instances.
            This may be optional for some implementations.
        :type pos: collections.abc.Iterable[smqtk.representation.DescriptorElement]

        :param neg: Iterable of negative exemplar DescriptorElement instances.
            This may be optional for some implementations. When empty,
            negatives are auto-selected from the indexed descriptors.
        :type neg: collections.abc.Iterable[smqtk.representation.DescriptorElement]

        :raises AttributeError: A positive descriptor element has no vector.
        :raises ValueError: No positive examples, or no negative examples
            (given or auto-selected), are available for training.
        :raises RuntimeError: The trained SVM model reports ``l == 0``.

        :return: Map of indexed descriptor elements to a rank value between
            [0, 1] (inclusive) range, where a 1.0 means most relevant and 0.0
            meaning least relevant.
        :rtype: dict[smqtk.representation.DescriptorElement, float]

        """
        # Notes:
        # - Pos and neg exemplars may be in our index.

        #
        # SVM model training
        #
        # Copy pos descriptors into a set for repeated iteration
        pos_set: Set[DescriptorElement] = set(pos)
        # Creating training matrix and labels
        train_labels = []
        train_vectors: List = []
        num_pos = 0
        for desc_element in pos_set:
            # Fetch the vector once; it is needed for both the None check and
            # the list conversion.
            pos_vec = desc_element.vector()
            if pos_vec is None:
                raise AttributeError(
                    "Positive descriptor element has no vector: %r"
                    % (desc_element,))
            train_labels.append(+1)
            train_vectors.append(pos_vec.tolist())
            num_pos += 1
        LOG.debug(f"Positives given: {num_pos}")

        # When no negative examples are given, naively pick most distant
        # example in our dataset, using HI metric, for each positive example
        neg_autoselect = set()
        # Copy neg descriptors into a set for testing size.
        if not isinstance(neg, collections.abc.Sized):
            #: :type: set[smqtk.representation.DescriptorElement]
            neg = set(neg)
        if not neg:
            # Adjacent f-strings instead of a backslash continuation, which
            # would embed the next line's indentation in the message.
            LOG.info(f"Auto-selecting negative examples. "
                     f"({self.autoneg_select_ratio} per positive)")
            # ``train_vectors`` only composed of positive examples at this
            # point.
            for p in pos_set:
                # Where d is the distance vector to descriptor elements in
                # cache.
                d = histogram_intersection_distance(p.vector(),
                                                    self._descr_matrix)
                # Scan vector for max distance index
                # - Allow variable number of maximally distant descriptors to
                #   be picked per positive.
                # - ``m_set`` maps distance value -> index, so two descriptors
                #   at exactly the same distance share one entry.
                # track most distant neighbors
                m_set = {}
                # track smallest distance of most distant neighbors
                m_val = -float('inf')
                for i in range(d.size):
                    if d[i] > m_val:
                        m_set[d[i]] = i
                        # Over capacity: drop the entry for the previous
                        # smallest tracked distance, if it is still present.
                        if len(m_set) > self.autoneg_select_ratio:
                            if m_val in m_set:
                                del m_set[m_val]
                        m_val = min(m_set)
                for i in m_set.values():
                    neg_autoselect.add(self._descr_cache[i])
            # Remove any positive examples from auto-selected results
            neg_autoselect.difference_update(pos_set)
            LOG.debug(f"Auto-selected negative descriptors "
                      f"[{len(neg_autoselect)}]: {neg_autoselect}")
        # Negatives (given first, then auto-selected) are appended after all
        # positives, so the first ``num_pos`` training rows are the positives.
        num_neg = 0
        for n_iterable in (neg, neg_autoselect):
            for d in n_iterable:
                train_labels.append(-1)
                # noinspection PyTypeChecker
                train_vectors.append(d.vector().tolist())
                num_neg += 1

        if not num_pos:
            raise ValueError("No positive examples provided.")
        elif not num_neg:
            raise ValueError("No negative examples provided.")

        # Training SVM model
        LOG.debug("online model training")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        param_str = self._gen_svm_parameter_string(num_pos, num_neg)
        svm_param = svm.svm_parameter(param_str)
        svm_model = svmutil.svm_train(svm_problem, svm_param)

        if hasattr(svm_model, "param"):
            LOG.debug(f"SVM input parameters: {param_str}")
            LOG.debug(f"SVM model parsed parameters: {svm_model.param}")
            param = svm_model.param
            wgt_pairs = [(param.weight_label[i], param.weight[i])
                         for i in range(param.nr_weight)]
            wgt_str = " ".join(["%s: %s" % wgt for wgt in wgt_pairs])
            LOG.debug(f"SVM model parsed weight parameters: {wgt_str}")

        if svm_model.l == 0:  # noqa: E741
            raise RuntimeError("SVM Model learning failed")

        #
        # Platt Scaling for probability rankings
        #

        LOG.debug("making test distance matrix")
        # Number of support vectors
        # Q: is this always the same as ``svm_model.l``?
        num_SVs = sum(svm_model.nSV[:svm_model.nr_class])
        # Support vector dimensionality
        dim_SVs = len(train_vectors[0])
        # initialize matrix they're going into
        svm_SVs = numpy.ndarray((num_SVs, dim_SVs), dtype=float)
        # Copy the first ``dim_SVs`` node values of each support vector into
        # the dense matrix.
        for i, nlist in enumerate(svm_model.SV[:svm_SVs.shape[0]]):
            svm_SVs[i, :] = [n.value for n in nlist[:len(train_vectors[0])]]
        # compute matrix of distances from support vectors to index elements
        # TODO: Optimize this step by caching SV distance vectors
        #       - It is known that SVs are vectors from the training data, so
        #           if the same descriptors are given to this function
        #           repeatedly (which is the case for IQR), this can be faster
        #           because we're only computing at most a few more distance
        #           vectors against our indexed descriptor matrix, and the rest
        #           have already been computed before.
        #       - At worst, we're effectively doing this call because each SV
        #           needs to have its distance vector computed.
        svm_test_k = compute_distance_matrix(svm_SVs,
                                             self._descr_matrix,
                                             histogram_intersection_distance,
                                             row_wise=True)

        # TODO(john.moeller): None of the Platt scaling should be necessary.
        # svmutil.svm_predict will apply the Platt scaling directly. See
        # https://github.com/cjlin1/libsvm/tree/master/python

        LOG.debug("Platt scaling")
        # the actual platt scaling stuff
        weights = numpy.array(svm_model.get_sv_coef()).flatten()
        margins = numpy.dot(weights, svm_test_k)
        rho = svm_model.rho[0]
        probA = svm_model.probA[0]
        probB = svm_model.probB[0]
        #: :type: numpy.core.multiarray.ndarray
        probs = 1.0 / (1.0 + numpy.exp((margins - rho) * probA + probB))

        # Detect whether we need to flip probabilities
        # - Probability of input positive examples should have a high
        #   probability score among the generated probabilities of our index.
        # - If the positive example probabilities show to be in the lower 50%,
        #   flip the generated probabilities, since it's experimentally known
        #   that the SVM will change which index it uses to represent a
        #   particular class label occasionally, which influences the Platt
        #   scaling apparently.
        pos_vectors = numpy.array(train_vectors[:num_pos])
        pos_test_k = compute_distance_matrix(svm_SVs,
                                             pos_vectors,
                                             histogram_intersection_distance,
                                             row_wise=True)
        pos_margins = numpy.dot(weights, pos_test_k)
        #: :type: numpy.core.multiarray.ndarray
        pos_probs = 1.0 / (1.0 +
                           numpy.exp((pos_margins - rho) * probA + probB))
        # Check if average positive probability is less than the average index
        # probability. If so, the platt scaling probably needs to be flipped.
        if (pos_probs.sum() / pos_probs.size) < (probs.sum() / probs.size):
            LOG.debug("inverting probabilities")
            probs = 1. - probs

        # Pair each cached descriptor with its probability, in cache order.
        rank_pool = dict(zip(self._descr_cache, probs))
        return rank_pool
Exemplo n.º 43
0
    def train_SVR_Linear(self,
                         labels,
                         vectors,
                         verbose,
                         C_range,
                         callback=None):
        '''Private use only'''
        # combine the labels and vectors into one set.
        data = []
        for i in range(len(labels)):
            data.append([labels[i], vectors[i]])

        #shuffle the data
        rng = random.Random()
        if self.random_seed != None:
            rng.seed(self.random_seed)
        rng.shuffle(data)

        # partition into validation and training
        if type(
                self.validation_size
        ) == float and self.validation_size > 0.0 and self.validation_size < 1.0:
            training_cutoff = int(len(data) * (1.0 - self.validation_size))
        elif type(self.validation_size
                  ) == int and self.validation_size < len(labels):
            training_cutoff = len(labels) - self.validation_size
        else:
            raise NotImplementedError(
                "Cannot determine validation set from %s" %
                self.validation_size)

        if verbose: print "Training Cutoff:", len(labels), training_cutoff
        training_data = data[:training_cutoff]
        validation_data = data[training_cutoff:]

        tmp_labels = []
        tmp_vectors = []
        for each in training_data:
            tmp_labels.append(each[0])
            tmp_vectors.append(each[1])

        prob = svm.svm_problem(tmp_labels, tmp_vectors)

        training_info = []
        training_svm = []
        training_table = Table()
        self.training_table = training_table
        i = 0
        for C in C_range:

            param = svm.svm_parameter(svm_type=self.svm_type,
                                      kernel_type=svm.LINEAR,
                                      C=C,
                                      p=self.epsilon,
                                      nu=self.nu)

            test_svm = svm.svm_model(prob, param)

            mse = 0.0
            total = len(validation_data)
            for label, vector in validation_data:
                pred = test_svm.predict(vector)
                error = label - pred
                mse += error * error
            mse = mse / total

            training_svm.append(test_svm)
            training_info.append([C, mse])
            training_table.setElement(i, 'C', C)
            training_table.setElement(i, 'mse', mse)
            i += 1

            if callback != None:
                callback(int(100 * float(i) / len(C_range)))

        if verbose: print
        if verbose: print "------------------------------"
        if verbose: print " Tuning Information:"
        if verbose: print "         C   error"
        if verbose: print "------------------------------"
        best = training_info[0]
        best_svm = training_svm[0]
        for i in range(len(training_info)):
            each = training_info[i]
            if verbose: print " %8.3e  %0.8f" % (each[0], each[1])
            if best[-1] > each[-1]:
                best = each
                best_svm = training_svm[i]
        if verbose: print "------------------------------"
        if verbose: print
        if verbose: print "------------------------------"
        if verbose: print " Best Tuning:"
        if verbose: print "         C   error"
        if verbose: print "------------------------------"
        if verbose: print " %8.3e  %0.8f" % (best[0], best[1])
        if verbose: print "------------------------------"
        if verbose: print
        self.training_info = training_info
        self.C = best[0]
        self.error = best[1]

        self.svm = best_svm
Exemplo n.º 44
0
    def train_SVR_Linear(self,labels,vectors,verbose, C_range, callback=None):
        '''Private use only'''
        # combine the labels and vectors into one set.
        data = []
        for i in range(len(labels)):
            data.append([labels[i],vectors[i]])
            
        #shuffle the data
        rng = random.Random()
        if self.random_seed != None:
            rng.seed(self.random_seed)
        rng.shuffle(data)
                
        # partition into validation and training
        if type(self.validation_size) == float and self.validation_size > 0.0 and self.validation_size < 1.0:
            training_cutoff = int(len(data)*(1.0-self.validation_size))
        elif type(self.validation_size) == int and self.validation_size < len(labels):
            training_cutoff = len(labels)-self.validation_size
        else:
            raise NotImplementedError("Cannot determine validation set from %s"%self.validation_size)
            
        if verbose: print "Training Cutoff:",len(labels),training_cutoff
        training_data = data[:training_cutoff]
        validation_data = data[training_cutoff:]
        
        tmp_labels = []
        tmp_vectors = []
        for each in training_data:
            tmp_labels.append(each[0])
            tmp_vectors.append(each[1])
        
        prob = svm.svm_problem(tmp_labels,tmp_vectors)
        
        training_info = []
        training_svm = []
        training_table = Table()
        self.training_table = training_table
        i=0
        for C in C_range:
                
            param = svm.svm_parameter(svm_type=self.svm_type,kernel_type = svm.LINEAR, C = C, p=self.epsilon,nu=self.nu)
                
            test_svm = svm.svm_model(prob, param)
                
            mse = 0.0
            total = len(validation_data)
            for label,vector in validation_data:
                pred = test_svm.predict(vector)
                error = label - pred
                mse += error*error
            mse = mse/total
 
            training_svm.append(test_svm)
            training_info.append([C,mse])
            training_table.setElement(i,'C',C)
            training_table.setElement(i,'mse',mse)
            i+=1

            if callback != None:
                callback(int(100*float(i)/len(C_range)))
                
        if verbose: print 
        if verbose: print "------------------------------"
        if verbose: print " Tuning Information:"
        if verbose: print "         C   error"
        if verbose: print "------------------------------"
        best = training_info[0]
        best_svm = training_svm[0]
        for i in range(len(training_info)):
            each = training_info[i]
            if verbose: print " %8.3e  %0.8f"%(each[0],each[1])
            if best[-1] > each[-1]:
                best = each
                best_svm = training_svm[i]
        if verbose: print "------------------------------"
        if verbose: print 
        if verbose: print "------------------------------"
        if verbose: print " Best Tuning:"
        if verbose: print "         C   error"
        if verbose: print "------------------------------"
        if verbose: print " %8.3e  %0.8f"%(best[0],best[1])
        if verbose: print "------------------------------"
        if verbose: print
        self.training_info = training_info
        self.C     = best[0]
        self.error = best[1]

        self.svm = best_svm
Exemplo n.º 45
0
def main(args):
	"""
	Run an SVM (c, gamma) grid search and report cross-validation and test results.

	args[0] is the path of a parameter file whose contents are executed as
	Python code. Presumably that file defines the configuration names read
	below (datafilename, crange, gammarange, nf, useprob, doscale, ...) --
	none of them are assigned in this function before use; TODO confirm.
	args[1] and args[2], when given, replace gammarange and crange with a
	single gamma and a single c value respectively.

	Steps, in order: load the data (plus an optional separate validation
	set), optionally regroup rows by predefined fold ids, binarize, min-max
	scale and/or z-score normalize, run the grid search through mygrid,
	re-run cross-validation with the selected parameters, print and log the
	results, optionally save the model and evaluate on a test file.

	Returns None.
	"""
	paramsfn = args[0]
	# SECURITY NOTE(review): the parameter file is executed verbatim, so it must be trusted.
	# The file handle opened here is never explicitly closed.
	exec(open(paramsfn,'r').read())
	
	
	# Optional command-line override of the search ranges with single values.
	# NOTE(review): only len(args) > 1 is checked but args[2] is read, so exactly
	# one extra argument raises IndexError.
	if len(args) > 1:
		gammarange = [float(args[1])]
		crange = [float(args[2])]
	
	output,input,fieldnames,fold_inds = load_data(datafilename,use_specific_fold_inds)
	sep_validation = False
	if separate_validation_set != '':
		output_valid,input_valid,fieldnames,fold_inds_valid = load_data(separate_validation_set,use_specific_fold_inds)
		sep_validation = True


	# [-1] is the placeholder used when no predefined fold boundaries exist;
	# presumably the cross-validation routine treats it as "pick folds itself" -- TODO confirm.
	fold_start = [-1]
	if sep_validation:
		fold_start_valid = [-1]
	
	if use_specific_fold_inds:
		# Regroup the rows so that all rows of one fold id are contiguous.
		# fold_start[k] is the row offset where fold k begins; the last entry is the row count.
		unique_fold_ids = unique(fold_inds)
		row_inds = []
		outputcopy = []
		inputcopy = zeros([size(input,0),size(input,1)],dtype='float64')
		fold_start = [0]
		
		curind = 0
		for ind in unique_fold_ids:
			row_inds = [i for i in xrange(len(fold_inds)) if fold_inds[i] == ind]
			inputcopy[curind:curind+len(row_inds),:] = input[row_inds,:]
			outputcopy.extend([output[i] for i in row_inds])
			curind += len(row_inds)
			
			fold_start.append(fold_start[-1]+len(row_inds))
		input = inputcopy
		output = outputcopy
		nf = len(fold_start)-1
		
		if sep_validation:
			# Same regrouping for the separate validation set.
			unique_fold_ids_valid = unique(fold_inds_valid)
			row_inds = []
			outputcopy = []
			inputcopy = zeros([size(input_valid,0),size(input_valid,1)],dtype='float64')
			fold_start_valid = [0]
			
			curind = 0
			for ind in unique_fold_ids_valid:
				row_inds = [i for i in xrange(len(fold_inds_valid)) if fold_inds_valid[i] == ind]
				inputcopy[curind:curind+len(row_inds),:] = input_valid[row_inds,:]
				outputcopy.extend([output_valid[i] for i in row_inds])
				curind += len(row_inds)
				
				fold_start_valid.append(fold_start_valid[-1]+len(row_inds))
			input_valid = inputcopy
			output_valid = outputcopy
			# NOTE(review): nf is overwritten with the validation set's fold count here.
			nf = len(fold_start_valid)-1
		
	# NOTE(review): only the training labels are binarized here; output_valid is left as loaded.
	if binarizeoutput:
		output,boundary = binarize_output(output,binary_threshold,binary_boundary_type)
	
	
	if testdatafilename != '':
		output_test,input_test,fieldnames,fold_inds_test = load_data(testdatafilename,False)
		if binarizeoutput:
			# Test labels are thresholded with the boundary obtained from the training labels.
			output_test = [1 if x > boundary else -1 for x in output_test]
	
	
	if doscale:
		# Min-max scaling with the training set's per-column extrema, applied unchanged
		# to the test and validation inputs. A constant column gives a zero denominator.
		maxinput = input.max(0);
		mininput = input.min(0);
		input = (input-mininput)/(maxinput-mininput)
		
		if testdatafilename != '':
			input_test = (input_test-mininput)/(maxinput-mininput)

		if savemodel:
			save_scale_data(datafilename+'_scales.dat',maxinput,mininput)
			
		if sep_validation:
			input_valid = (input_valid-mininput)/(maxinput-mininput)



	if donormalize:
		# Z-score normalization with the training set's per-column mean and standard deviation.
		means = input.mean(0)
		stds = sqrt(input.var(0))
		input = (input-means)/stds
		if testdatafilename != '':
			input_test = (input_test-means)/stds

		if savemodel:
			save_zscore_data(datafilename+'_meansstdevs.dat',means,stds)
	
		if sep_validation:
			input_valid = (input_valid-means)/stds
		
	# Worker pool handed to the grid search below; it is never closed in this function.
	if numcpus == 'auto':
		p = Pool()
	else:
		p = Pool(numcpus)
	
	
	if choose_specific_features:
		if choose_specific_features_increasing:
			# Replace the feature list by growing prefixes of itself: its first 2, 4, 6, ...
			# entries (prefix lengths strictly below the full length).
			specific_selected_features = [specific_selected_features[:i] for i in xrange(2,len(specific_selected_features),2)]
			
		# One full search/report cycle per feature subset.
		for specific_selected_choice in specific_selected_features:
			inputfiltered = input[:,specific_selected_choice]
			if sep_validation:
				inputfiltered_valid = input_valid[:,specific_selected_choice]

			if dopca:
				coeff,temp,latent = princomp(inputfiltered)

				if savemodel:
					save_pca_coeffs(datafilename+'_pcacoeffs.dat',coeff,mean(inputfiltered.T,axis=1))
				inputfiltered = temp
				if sep_validation:
					# NOTE(review): PCA together with a separate validation set is not handled;
					# main() returns here without searching or reporting anything.
					return
							
			with Timer():
			
				if sep_validation:
					if use_specific_fold_inds:
						results = mygrid.grid_classify_sepvalid (crange,gammarange,output,[list(x) for x in inputfiltered],output_valid,[list(x) for x in inputfiltered_valid],nf,useprob,timeout,p,fold_start,fold_start_valid)
					else:
						results = mygrid.grid_classify_sepvalid (crange,gammarange,output,[list(x) for x in inputfiltered],output_valid,[list(x) for x in inputfiltered_valid],nf,useprob,timeout,p)
				else:
					if use_specific_fold_inds:
						results = mygrid.grid_classify (crange,gammarange,output,[list(x) for x in inputfiltered],nf,useprob,timeout,p,fold_start)
					else:
						results = mygrid.grid_classify (crange,gammarange,output,[list(x) for x in inputfiltered],nf,useprob,timeout,p)

				
			# results[-2] is used as c, results[-1] as gamma and results[-3] as the bias
			# (see the 'Optimal ...' print further down).
			param = svm.svm_parameter('-c %g -g %g -b %d' % (results[-2],results[-1],int(useprob)))
			
			prob = svm.svm_problem(output, [list(x) for x in inputfiltered])
			# Copy the fold offsets into a C int array for the libsvm call.
			fold_start_p = (c_int *len(fold_start))()
			for i in xrange(len(fold_start)):
				fold_start_p[i] = fold_start[i]
			if posclass == 'auto':
				posclass = output[0]

			if sep_validation:
				prob_valid = svm.svm_problem(output_valid, [list(x) for x in inputfiltered_valid])
				testlength = prob_valid.l
				fold_start_p_valid = (c_int *len(fold_start_valid))()
				for i in xrange(len(fold_start_valid)):
					fold_start_p_valid[i] = fold_start_valid[i]
			else:
				testlength = prob.l	
								
			# Output buffer that the cross-validation call fills with one value per evaluated row.
			target = (c_double * testlength)()
										
			#[maxauc,maxoptacc,maxphi,minfpfnration,maxf1,optbias,optc,optgamma]
			
			if sep_validation:
				libsvm.svm_cross_validation_sepsets(prob, prob_valid,fold_start_p, fold_start_p_valid,param, nf, target)
			else:
				libsvm.svm_cross_validation(prob, fold_start_p, param, nf, target)
				
				
			if sep_validation:
				ys = prob_valid.y[:testlength]
			else:
				ys = prob.y[:prob.l]
				
			# Two-column array: true label, cross-validation output.
			db = array([[ys[i],target[i]] for i in range(testlength)])
				
			
			neg = len([x for x in ys if x != posclass])
			pos = testlength-neg;

			# `or True` makes this branch unconditional; the else branch below is unreachable.
			if len(specific_selected_features) == 1 or True:
				# NOTE(review): the PDF is opened and closed, but the only call that received it
				# is commented out, so nothing visible here writes a page to it.
				pdfpages = PdfPages('%s_train.pdf' % (outputlog))
#				auc,topacc,optaccbias,topphi,optphibias,top_tps_bias,top_fps = mygrid.calc_AUC(db,neg,pos,posclass,useprob,[],True,pdfpages,'Optimal Cross-Validation ROC curve')
				topacc,topphi,minfpfnratio,topf1,auc,optbias = mygrid.optimize_results(db,neg,pos,posclass,'F1')
				# Side-by-side print of the re-computed metrics and the grid search's reported ones.
				print [topacc,results[1]]
				print [topphi,results[2]]
				print [topf1,results[4]]
				print [auc,results[0]]
				pdfpages.close()
#				print target
				if sep_validation:
					ACC,PHI,confusionmatrix = mygrid.evaluations_classify(output_valid,target,posclass,results[-3])
				else:
					ACC,PHI,confusionmatrix = mygrid.evaluations_classify(output,target,posclass,results[-3])
				# negclass is only used for display; any posclass other than 1 yields negclass 1.
				if posclass == 1:
					negclass = 0;
				else:
					negclass = 1;

				numpred_pos = confusionmatrix[0,0]+confusionmatrix[1,0]
				numpred_neg = confusionmatrix[0,1]+confusionmatrix[1,1]
				
				# Kappa: (accuracy - chance agreement) / (1 - chance agreement), with chance
				# agreement computed from the confusion-matrix column sums and the class counts.
				N = pos+neg
				probchance = (numpred_pos*pos+numpred_neg*neg)*1.0/(N*N)
				kappa = (topacc-probchance)*1.0/(1-probchance);
				
				print 'Train optimized accuracy = %g' % (topacc)
				print 'Train optimized Phi statistic = %g' % (topphi)
				print 'Train optimized kappa = %g' % (kappa)
				print 'Train optimized F1 score = %f' % (topf1)
				# NOTE(review): these ratios divide by pos/neg without a zero check, and would be
				# integer divisions under Python 2 if the matrix entries are ints -- confirm dtype.
				print 'Train optimized TP/RECALL = %g, FP = %g, PRECISION = %g' % (confusionmatrix[0,0]/pos,confusionmatrix[1,0]/neg,confusionmatrix[0,0]/(confusionmatrix[0,0]+confusionmatrix[1,0]))
				print '================================'
				print '||   ||%6d |%6d |       ||' % (posclass,negclass)
				print '================================'
				print '||%3d||%6g |%6g |%6g ||' % (posclass,confusionmatrix[0,0],confusionmatrix[0,1],pos)#confusionmatrix[0,0]+confusionmatrix[0,1])
				print '||%3d||%6g |%6g |%6g ||' % (negclass,confusionmatrix[1,0],confusionmatrix[1,1],neg)#confusionmatrix[1,0]+confusionmatrix[1,1])
				print '||----------------------------||'
				print '||   ||%6g |%6g |%6g ||' % (confusionmatrix[0,0]+confusionmatrix[1,0],confusionmatrix[0,1]+confusionmatrix[1,1],pos+neg)#confusionmatrix[1,0]+confusionmatrix[1,1])
				print '================================'
				
				
			else:
				# Unreachable (see `or True` above). It would also leave kappa, topf1 and
				# confusionmatrix undefined for the log block below.
				auc,topacc,optaccbias,topphi,optphibias,top_tps_bias,top_fps = mygrid.calc_AUC(db,neg,pos,posclass,useprob,[],False,0,'Optimal Cross-Validation ROC curve')
			
			print 'Optimal gamma = %g\nOptimal c = %g\nOptimal Bias = %g' % (results[-1],results[-2],results[-3])
			print 'Top CV results: AUC = %g, OPTIMIZED ACC = %g, OPTIMIZED PHI = %g' % (auc,topacc,topphi)

			if outputlog != '':
				# Append the settings and training results for this feature subset to the log file.
				fout = open(outputlog,'a')
				print >> fout, '========================='
				print >> fout, datafilename
				print >> fout, doscale, donormalize, dopca, '(scale/norm/pca)'
				print >> fout, crange[0],crange[-1], gammarange[0], gammarange[-1], '(cs,gammas)'
				print >> fout, use_specific_fold_inds, nf, '(use specific folds, numfold)'
				print >> fout, 'SPECIFIC FIELDS:'
				print >> fout, specific_selected_choice
				if fieldnames != []:
					for i in specific_selected_choice:
						print >> fout, fieldnames[i],
					print >> fout
				print >> fout, 'train: '
				print >> fout, '    AUC=%g,ACC=%g,kappa=%g,phi=%g,f1=%g (g=%g,c=%g,bias=%g)' % (auc,topacc,kappa,topphi,topf1,results[-1],results[-2],results[-3])
				print >> fout, '    ||%3d||%6g |%6g |%6g ||' % (posclass,confusionmatrix[0,0],confusionmatrix[0,1],pos)#confusionmatrix[0,0]+confusionmatrix[0,1])
				print >> fout, '    ||%3d||%6g |%6g |%6g ||' % (negclass,confusionmatrix[1,0],confusionmatrix[1,1],neg)#confusionmatrix[1,0]+confusionmatrix[1,1])
				fout.close()
			
			if outputpredictions:
				# One line per row: CV output, true label, then the unfiltered input row as
				# 1-based "index:value" pairs. The file is opened with 'w', so each feature
				# subset overwrites the previous subset's predictions.
				fout = open(predictionslog,'w')
				if sep_validation:
					for ind in xrange(len(output_valid)):
						label = output_valid[ind]
						value = target[ind]
						oneinputrow = input_valid[ind,:]
						print >> fout, value, label,
						
						for j in xrange(len(oneinputrow)):
							print >> fout, '%d:%f' % (j+1,oneinputrow[j]),
						print >> fout
				else:
					for ind in xrange(len(output)):
						label = output[ind]
						value = target[ind]
						oneinputrow = input[ind,:]
						print >> fout, value, label,
						
						for j in xrange(len(oneinputrow)):
							print >> fout, '%d:%f' % (j+1,oneinputrow[j]),
						print >> fout
				fout.close()
			
			del target
		
				
			if savemodel:
				# Train on the whole (filtered) training set with the selected parameters and save it.
				param = ('-c %g -g %g -b %d' % (results[-2],results[-1],int(useprob)))
				m = svm_train(output,[list(x) for x in inputfiltered],param)
				svm_save_model(datafilename + '.model',m)
				
				
			
			if testdatafilename != '':
				inputfiltered_test = input_test[:,specific_selected_choice]
				if dopca:
					# NOTE(review): the test data is centered with its own column means, while the
					# saved PCA model above stores the training means -- confirm this is intended.
					M = (inputfiltered_test-mean(inputfiltered_test.T,axis=1)).T # subtract the mean (along columns)
					inputfiltered_test = dot(coeff.T,M).T # projection of the data in the new space

				param = ('-c %g -g %g -b %d' % (results[-2],results[-1],int(useprob)))
				m = svm_train(output,[list(x) for x in inputfiltered],param)
				pred_labels, (ACC, MSE, SCC), pred_values = svm_predict(output_test,[list(x) for x in inputfiltered_test],m,'-b %d' % (int(useprob)))
				# ACC from svm_predict is replaced by the bias-adjusted evaluation below.
				ACC,PHI,confusionmatrix = mygrid.evaluations_classify(output_test, [x[0] for x in pred_values],posclass,results[-3])
				db = array([[output_test[i],pred_values[i][0]] for i in range(len(output_test))])
				neg = len([x for x in output_test if x != posclass])
				pos = len(output_test)-neg

				# AUC is only computed when both classes occur in the test set; otherwise it stays 0.
				auctest = 0				
				if neg != 0 and pos != 0:
					# NOTE(review): pdfpages was already closed after the training report; it is
					# passed here with the preceding argument False, presumably disabling plotting -- confirm.
					auctest,topacctest,optaccbias,topphitest,optphibias,top_tps_bias,top_fps = mygrid.calc_AUC(db,neg,pos,posclass,useprob,[],False,pdfpages,'Test ROC curve',results[-3])

				numpred_pos = confusionmatrix[0,0]+confusionmatrix[1,0]
				numpred_neg = confusionmatrix[0,1]+confusionmatrix[1,1]
				
				N = pos+neg
				probchance = (numpred_pos*pos+numpred_neg*neg)*1.0/(N*N)
				# ACC is divided by 100 here, i.e. it is treated as a percentage.
				testkappa = (ACC/100.0-probchance)*1.0/(1-probchance);

				
				print 'Test optimized accuracy = %g' % (ACC)
				print 'Test optimized Phi statistic = %g' % (PHI)
				print 'Test optimized kappa = %g' % (testkappa)
				print '================================'
				print '||   ||%6d |%6d |       ||' % (m.get_labels()[0],m.get_labels()[1])
				print '================================'
				print '||%3d||%6g |%6g |%6g ||' % (m.get_labels()[0],confusionmatrix[0,0],confusionmatrix[0,1],pos)#confusionmatrix[0,0]+confusionmatrix[0,1])
				print '||%3d||%6g |%6g |%6g ||' % (m.get_labels()[1],confusionmatrix[1,0],confusionmatrix[1,1],neg)#confusionmatrix[1,0]+confusionmatrix[1,1])
				print '||----------------------------||'
				print '||   ||%6g |%6g |%6g ||' % (confusionmatrix[0,0]+confusionmatrix[1,0],confusionmatrix[0,1]+confusionmatrix[1,1],pos+neg)#confusionmatrix[1,0]+confusionmatrix[1,1])
				print '================================'


				if outputlog != '':
					fout = open(outputlog,'a')
	
					print >> fout, 'test: '
					print >> fout, '   ACC=%g,AUC=%g,kappa=%g,phi=%g' % (ACC,auctest,testkappa,PHI)
					print >> fout, '   ||%3d||%6g |%6g |%6g ||' % (m.get_labels()[0],confusionmatrix[0,0],confusionmatrix[0,1],pos)#confusionmatrix[0,0]+confusionmatrix[0,1])
					print >> fout, '   ||%3d||%6g |%6g |%6g ||' % (m.get_labels()[1],confusionmatrix[1,0],confusionmatrix[1,1],neg)#confusionmatrix[1,0]+confusionmatrix[1,1])
	
					fout.close()
	else:
		# All features are used. NOTE(review): this branch never uses the separate
		# validation set, even when one was loaded above.
		
		with Timer():
			if use_specific_fold_inds:
				results = mygrid.grid_classify (crange,gammarange,output,[list(x) for x in input],nf,useprob,timeout,p,fold_start)
			else:
				results = mygrid.grid_classify (crange,gammarange,output,[list(x) for x in input],nf,useprob,timeout,p)

		# Re-run cross-validation once with the selected c (results[-2]) and gamma (results[-1]).
		param = svm.svm_parameter('-c %g -g %g -b %d' % (results[-2],results[-1],int(useprob)))
		prob = svm.svm_problem(output, [list(x) for x in input])
		target = (c_double * prob.l)()
		fold_start_p = (c_int *len(fold_start))()
		for i in xrange(len(fold_start)):
			fold_start_p[i] = fold_start[i]
		
		if posclass == 'auto':
			posclass = output[0]
			
		libsvm.svm_cross_validation(prob, fold_start_p, param, nf, target)
		ys = prob.y[:prob.l]
		db = [[ys[i],target[i]] for i in range(prob.l)]
		db = array(db)
		neg = len([x for x in ys if x != posclass])
		pos = prob.l-neg;
		
		pdfpages = PdfPages('%s_train.pdf' % (outputlog))
		auc,topacc,optaccbias,topphi,optphibias,top_tps_bias,top_fps = mygrid.calc_AUC(db,neg,pos,posclass,useprob,[],True,pdfpages,'Optimal Cross-Validation ROC curve')
		pdfpages.close()
		ACC,PHI,confusionmatrix = mygrid.evaluations_classify(output, target,posclass,results[-3])
		if posclass == 1:
			negclass = 0;
		else:
			negclass = 1;
			
		print 'Train optimized accuracy = %g' % (topacc)
		print 'Train optimized phi statististic = %g' % (topphi)
		print 'TP/RECALL = %g, FP = %g, PRECISION = %g' % (confusionmatrix[0,0]/pos,confusionmatrix[1,0]/neg,confusionmatrix[0,0]/(confusionmatrix[0,0]+confusionmatrix[1,0]))
		print '================================'
		print '||   ||%6d |%6d |       ||' % (posclass,negclass)
		print '================================'
		print '||%3d||%6g |%6g |%6g ||' % (posclass,confusionmatrix[0,0],confusionmatrix[0,1],pos)#confusionmatrix[0,0]+confusionmatrix[0,1])
		print '||%3d||%6g |%6g |%6g ||' % (negclass,confusionmatrix[1,0],confusionmatrix[1,1],neg)#confusionmatrix[1,0]+confusionmatrix[1,1])
		print '||----------------------------||'
		print '||   ||%6g |%6g |%6g ||' % (confusionmatrix[0,0]+confusionmatrix[1,0],confusionmatrix[0,1]+confusionmatrix[1,1],pos+neg)#confusionmatrix[1,0]+confusionmatrix[1,1])
		print '================================'
		
		if outputpredictions:
			# One line per row: CV output, true label, then 1-based "index:value" pairs of the input row.
			fout = open(predictionslog,'w')
			for ind in xrange(len(output)):
				label = output[ind]
				value = target[ind]
				oneinputrow = input[ind,:]
				print >> fout, value, label,
				
				for j in xrange(len(oneinputrow)):
					print >> fout, '%d:%f' % (j+1,oneinputrow[j]),
				print >> fout
			fout.close()
		del target
		
		# NOTE(review): the bias printed here is optphibias from calc_AUC, whereas the log
		# block at the end of this branch writes results[-3] as the bias.
		print 'Optimal gamma = %g\nOptimal c = %g\nOptimal Bias = %g' % (results[-1],results[-2],optphibias)
		print 'Top CV results: AUC = %g, OPTIMIZED ACC = %g, OPTIMIZED PHI = %g' % (auc,topacc,topphi)
		if savemodel:
			param = ('-c %g -g %g -b %d' % (results[-2],results[-1],int(useprob)))
			m = svm_train(output,[list(x) for x in input],param)
			svm_save_model(datafilename+'.model',m)
		
		if testdatafilename != '':
			# Train on the full training set and evaluate on the test file.
			param = ('-c %g -g %g -b %d' % (results[-2],results[-1],int(useprob)))
			m = svm_train(output,[list(x) for x in input],param)

			pred_labels, (ACC, MSE, SCC), pred_values = svm_predict(output_test,[list(x) for x in input_test],m,'-b %d' % (int(useprob)))
			ACC,PHI,confusionmatrix = mygrid.evaluations_classify(output_test, [x[0] for x in pred_values],posclass,results[-3])

			db = array([[output_test[i],pred_values[i][0]] for i in range(len(output_test))])
			neg = len([x for x in output_test if x != posclass])
			pos = len(output_test)-neg;
			pdfpages = PdfPages('%s_test.pdf' % (outputlog))
			# AUC is only computed when both classes occur in the test set; otherwise it stays 0.
			# The calc_AUC call below also rebinds optphibias and the other shared names.
			auctest = 0
			if neg != 0 and pos != 0:
				auctest,topacctest,optaccbias,topphitest,optphibias,top_tps_bias,top_fps = mygrid.calc_AUC(db,neg,pos,posclass,useprob,[],True,pdfpages,'Test ROC curve',results[-3])
			pdfpages.close()
			
			print 'Test accuracy = %g' % (ACC)
			print 'Test Phi statistic = %g' % (PHI)
			# NOTE(review): unlike the AUC above, these ratios are not guarded against pos or neg being 0.
			print 'TP/RECALL = %g, FP = %g, PRECISION = %g' % (confusionmatrix[0,0]/pos,confusionmatrix[1,0]/neg,confusionmatrix[0,0]/(confusionmatrix[0,0]+confusionmatrix[1,0]))
			print '================================'
			print '||   ||%6d |%6d |       ||' % (m.get_labels()[0],m.get_labels()[1])
			print '================================'
			print '||%3d||%6g |%6g |%6g ||' % (m.get_labels()[0],confusionmatrix[0,0],confusionmatrix[0,1],pos)#confusionmatrix[0,0]+confusionmatrix[0,1])
			print '||%3d||%6g |%6g |%6g ||' % (m.get_labels()[1],confusionmatrix[1,0],confusionmatrix[1,1],neg)#confusionmatrix[1,0]+confusionmatrix[1,1])
			print '||----------------------------||'
			print '||   ||%6g |%6g |%6g ||' % (confusionmatrix[0,0]+confusionmatrix[1,0],confusionmatrix[0,1]+confusionmatrix[1,1],pos+neg)#confusionmatrix[1,0]+confusionmatrix[1,1])
			print '================================'

		
		if outputlog != '':
			# Append a short summary; ACC and PHI hold the test values here when a test file was given.
			fout = open(outputlog,'a')
			print >> fout, '========================='
			print >> fout, fieldnames
			print >> fout, 'train: AUC=%g,ACC=%g,PHI=%g (g=%g,c=%g,bias=%g)' % (auc,topacc,topphi,results[-1],results[-2],results[-3])
			if testdatafilename != '':
				print >> fout, 'test: ACC=%g,AUC=%g,PHI=%g' % (ACC,auctest,PHI)
			fout.close()
Exemplo n.º 46
0
    def train(self, class_examples=None, **kwds):
        """
        Train the supervised classifier model.

        If a model is already loaded, we will raise an exception in order to
        prevent accidental overwrite.

        If the same label is provided to both ``class_examples`` and ``kwds``,
        the examples given to the reference in ``kwds`` will prevail.

        :param class_examples: Dictionary mapping class labels to iterables of
            DescriptorElement training examples.
        :type class_examples: dict[collections.Hashable,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :param kwds: Keyword assignment of labels to iterables of
            DescriptorElement training examples.
        :type kwds: dict[str,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :raises ValueError: There were no class examples provided.
        :raises ValueError: Less than 2 classes were given.
        :raises RuntimeError: A model already exists in this instance.
            Following through with training would overwrite this model.
            Throwing an exception for information protection.

        """
        # The parent implementation returns the class-to-examples mapping
        # that is used for the rest of this method.
        class_examples = \
            super(LibSvmClassifier, self).train(class_examples, **kwds)

        # Offset from 0 for positive class labels to use
        # - not using label of 0 because we think libSVM wants positive labels
        CLASS_LABEL_OFFSET = 1

        # Stuff for debug reporting: libSVM is run quiet ('-q') and matrix
        # construction progress is not reported unless debug logging is on.
        etm_ri = None
        param_debug = {'-q': ''}
        if self._log.getEffectiveLevel() <= logging.DEBUG:
            etm_ri = 1.0
            param_debug = {}

        # Form libSVM problem input values
        self._log.debug("Formatting problem input")
        train_labels = []
        train_vectors = []
        train_group_sizes = []  # number of examples per class
        self.svm_label_map = {}
        # Making SVM label assignment deterministic to alphabetic order
        for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
            # Map integer SVM label to semantic label
            self.svm_label_map[i] = l

            self._log.debug('-- class %d (%s)', i, l)
            # requires a sequence, so making the iterable ``g`` a tuple
            g = class_examples[l]
            if not isinstance(g, collections.Sequence):
                g = tuple(g)

            train_group_sizes.append(float(len(g)))
            x = elements_to_matrix(g, report_interval=etm_ri)
            x = self._norm_vector(x)
            # One label per matrix row, kept parallel to ``train_vectors``.
            train_labels.extend([i] * x.shape[0])
            train_vectors.extend(x.tolist())
            del g, x

        # The adjacent literals previously concatenated without a separating
        # space ("vectorsbeing sent"); the trailing space fixes the message.
        assert len(train_labels) == len(train_vectors), \
            "Count miss-match between parallel labels and descriptor vectors " \
            "being sent to libSVM (%d != %d)" \
            % (len(train_labels), len(train_vectors))

        self._log.debug("Forming train params")
        #: :type: dict
        params = deepcopy(self.train_params)
        params.update(param_debug)
        # Calculating class weights for C-SVC SVM ('-s' absent or equal to 0)
        if '-s' not in params or int(params['-s']) == 0:
            total_examples = sum(train_group_sizes)
            for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
                # weight is the ratio of between number of other-class examples
                # to the number of examples in this class, floored at 1.0.
                other_class_examples = total_examples - n
                w = max(1.0, other_class_examples / float(n))
                params['-w' + str(i)] = w
                self._log.debug("-- class '%s' weight: %s",
                                self.svm_label_map[i], w)

        self._log.debug("Making parameters obj")
        svm_params = svmutil.svm_parameter(self._gen_param_string(params))
        self._log.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        self._log.debug("Training SVM model")
        self.svm_model = svmutil.svm_train(svm_problem, svm_params)
        self._log.debug("Training SVM model -- Done")

        # Persist the label map and the model only when file paths are set.
        if self.svm_label_map_fp:
            self._log.debug("saving file -- labels -- %s",
                            self.svm_label_map_fp)
            with open(self.svm_label_map_fp, 'wb') as f:
                cPickle.dump(self.svm_label_map, f, -1)
        if self.svm_model_fp:
            self._log.debug("saving file -- model -- %s", self.svm_model_fp)
            svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
Exemplo n.º 47
0
    def train(self, class_examples=None, **kwds):
        """
        Train the supervised classifier model.

        If a model is already loaded, we will raise an exception in order to
        prevent accidental overwrite.

        If the same label is provided to both ``class_examples`` and ``kwds``,
        the examples given to the reference in ``kwds`` will prevail.

        :param class_examples: Dictionary mapping class labels to iterables of
            DescriptorElement training examples.
        :type class_examples: dict[collections.Hashable,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :param kwds: Keyword assignment of labels to iterables of
            DescriptorElement training examples.
        :type kwds: dict[str,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :raises ValueError: There were no class examples provided.
        :raises ValueError: Less than 2 classes were given.
        :raises RuntimeError: A model already exists in this instance.
            Following through with training would overwrite this model.
            Throwing an exception for information protection.

        """
        # The parent implementation returns the class-to-examples mapping
        # that is used for the rest of this method.
        class_examples = \
            super(LibSvmClassifier, self).train(class_examples, **kwds)

        # Offset from 0 for positive class labels to use
        # - not using label of 0 because we think libSVM wants positive labels
        CLASS_LABEL_OFFSET = 1

        # Stuff for debug reporting: libSVM is run quiet ('-q') and matrix
        # construction progress is not reported unless debug logging is on.
        etm_ri = None
        param_debug = {'-q': ''}
        if self._log.getEffectiveLevel() <= logging.DEBUG:
            etm_ri = 1.0
            param_debug = {}

        # Form libSVM problem input values
        self._log.debug("Formatting problem input")
        train_labels = []
        train_vectors = []
        train_group_sizes = []  # number of examples per class
        self.svm_label_map = {}
        # Making SVM label assignment deterministic to alphabetic order
        for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
            # Map integer SVM label to semantic label
            self.svm_label_map[i] = l

            self._log.debug('-- class %d (%s)', i, l)
            # requires a sequence, so making the iterable ``g`` a tuple
            g = class_examples[l]
            if not isinstance(g, collections.Sequence):
                self._log.debug('   (expanding iterable into sequence)')
                g = tuple(g)

            train_group_sizes.append(float(len(g)))
            x = elements_to_matrix(g, report_interval=etm_ri)
            x = self._norm_vector(x)
            # One label per matrix row, kept parallel to ``train_vectors``.
            train_labels.extend([i] * x.shape[0])
            train_vectors.extend(x.tolist())
            del g, x

        # The adjacent literals previously concatenated without a separating
        # space ("vectorsbeing sent"); the trailing space fixes the message.
        assert len(train_labels) == len(train_vectors), \
            "Count miss-match between parallel labels and descriptor vectors " \
            "being sent to libSVM (%d != %d)" \
            % (len(train_labels), len(train_vectors))

        self._log.debug("Forming train params")
        #: :type: dict
        params = deepcopy(self.train_params)
        params.update(param_debug)
        # Calculating class weights for C-SVC SVM ('-s' absent or equal to 0)
        if '-s' not in params or int(params['-s']) == 0:
            total_examples = sum(train_group_sizes)
            for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
                # weight is the ratio of between number of other-class examples
                # to the number of examples in this class, floored at 1.0.
                other_class_examples = total_examples - n
                w = max(1.0, other_class_examples / float(n))
                params['-w' + str(i)] = w
                self._log.debug("-- class '%s' weight: %s",
                                self.svm_label_map[i], w)

        self._log.debug("Making parameters obj")
        svm_params = svmutil.svm_parameter(self._gen_param_string(params))
        self._log.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        # Drop this method's reference to the Python-side vector list before
        # training starts.
        del train_vectors
        self._log.debug("Training SVM model")
        self.svm_model = svmutil.svm_train(svm_problem, svm_params)
        self._log.debug("Training SVM model -- Done")

        # Persist the label map and the model only when file paths are set.
        if self.svm_label_map_fp:
            self._log.debug("saving file -- labels -- %s",
                            self.svm_label_map_fp)
            with open(self.svm_label_map_fp, 'wb') as f:
                cPickle.dump(self.svm_label_map, f, -1)
        if self.svm_model_fp:
            self._log.debug("saving file -- model -- %s", self.svm_model_fp)
            svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
Exemplo n.º 48
0
# Plot the two point sets: positives as red dots, negatives as blue dots.
# ('answers.' is not a valid matplotlib format string; 'b.' is assumed to be
# the intended colour/marker -- confirm.)
pyplot.plot(data_pos[:, 0], data_pos[:, 1], 'r.')
pyplot.plot(data_neg[:, 0], data_neg[:, 1], 'b.')
pyplot.xlim(-2.5, 2.5)
pyplot.ylim(-2, 2)

# Plot two reference circles of radius 1 (red) and radius 2 (blue).
# Multiplying by pi first makes the intermediate value a float, so the
# following division by 180 is never an integer division.
angles_circle = [i*pi/180 for i in range(0,360)]
# Writing i/180*pi instead would integer-divide first under Python 2 and
# yield 0 for every angle in range(0, 180).
x = cos(angles_circle)
y = sin(angles_circle)
pyplot.plot(x, y, 'r')
pyplot.plot(2*x, 2*y, 'b')
pyplot.show()

# Build the training vectors: stack both point sets row-wise (axis=0) into a
# single array, then convert it to a list of lists for libSVM.
data = np.append(data_pos, data_neg, axis = 0)
data = data.tolist()

# Labels: +1 by default, -1 for the slice below. A flat array is used so that
# the conversion yields a plain list of floats, as the libSVM interface expects.
data_label = ones(pos_dot_num + neg_dot_num)
# NOTE(review): the slice is hard-coded; presumably it should cover exactly
# the rows that came from data_neg -- verify against pos_dot_num/neg_dot_num.
data_label[11:20] = -1
data_label = data_label.tolist()
prob = svm_problem(data_label, data)                            # data & data_label must be lists
param = svm_parameter('-c 100 -g 4')

model = svm_train(prob, param)

Exemplo n.º 49
0
            # split the line on commas, last element is truth label
            values = line.rstrip().split(',')
            arr = []
            # convert elements to floats
            for i in range(0, 4):
                arr.append(float(values[i]))
            x.append(arr)
            if values[4] not in names_to_integers:
                raise TruthLabelMissingInDictError("Could not find \"" +
                                                   values[4] + "\" in file")
            y.append(names_to_integers[values[4]])
            #print values
    return x, y


if __name__ == "__main__":
    flower_truth = {
        "Iris-setosa": 1,
        "Iris-versicolor": 2,
        "Iris-virginica": 3
    }
    x, y = read_iris_dataset("../datasets/iris.data", flower_truth)
    print x, y

    prob = svm.svm_problem(y, x)
    param = svm.svm_parameter('-t 0 -c 4 -b 1')
    m = svmutil.svm_train(prob, param)

    p = svmutil.svm_predict(y, x, m)
    print "DONE"
Exemplo n.º 50
0
    def rank(self, pos_ids, neg_ids=()):
        """
        Rank the current model, returning a mapping of element IDs to a
        ranking valuation. This valuation should be a probability in the range
        of [0, 1], where 1.0 is the highest rank and 0.0 is the lowest rank.

        An SVM is trained on the given positive IDs against the given negative
        IDs plus automatically picked negatives; its decision values over all
        indexed elements are then converted to probabilities.

        :raises RuntimeError: No current model.
        :raises RuntimeError: SVM training produced a model with no data.

        :param pos_ids: List of positive data IDs. Required.
        :type pos_ids: list of int

        :param neg_ids: List of negative data IDs. Optional.
        :type neg_ids: list of int

        :return: Mapping of ingest ID to a rank.
        :rtype: dict of (int, float)

        """
        if not self.has_model():
            raise RuntimeError("No model available for this indexer.")

        # Always extend the given negatives with automatically picked ones.
        neg_ids = set(neg_ids).union(self._pick_auto_negatives(pos_ids))

        #
        # SVM model training
        #
        # Set membership keeps the label lookup below constant-time per UID
        # instead of scanning the ``pos_ids`` list for every UID.
        pos_id_set = set(pos_ids)
        uid_list = sorted(pos_id_set | neg_ids)
        feature_len = self._feature_mat.shape[1]
        # positive label: 1, negative label: 0
        labels = [int(uid in pos_id_set) for uid in uid_list]
        train_features = \
            self._feature_mat[[self._uid2idx_map[uid] for uid in uid_list], :]

        self.log.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(labels, train_features.tolist())
        self.log.debug("Creating SVM model")
        # Weight substituted into the training parameter template; at least
        # 1.0. NOTE(review): an empty ``pos_ids`` divides by zero here.
        w1_weight = max(1.0, len(neg_ids)/float(len(pos_ids)))
        svm_model = svmutil.svm_train(svm_problem,
                                      self.svm_train_params % w1_weight)
        if svm_model.l == 0:
            raise RuntimeError("SVM Model learning failed")

        # Finding associated clip IDs of trained support vectors by matching
        # each support vector's values against the rows of the feature matrix.
        # NOTE(review): rows are matched by hash only, so identical rows (or
        # hash collisions) resolve to the last such row.
        self.log.debug("Finding clip IDs for support vectors")
        hash2feature_idx = dict((hash(tuple(f)), r)
                                for r, f in enumerate(self._feature_mat))
        svm_sv_idxs = []
        tmp_list = [0] * feature_len
        # Only the first two per-class support vector counts are read.
        for r in range(svm_model.nSV[0] + svm_model.nSV[1]):
            for c in range(feature_len):
                tmp_list[c] = svm_model.SV[r][c].value
            svm_sv_idxs.append(hash2feature_idx[hash(tuple(tmp_list))])

        #
        # Platt Scaling for probability ranking
        #

        # Features associated to support vectors in trained model
        self.log.debug("Forming data for Platt Scaling")
        # We need the distances between support vectors to all features
        test_kernel = self._distance_mat[svm_sv_idxs, :]

        weights = numpy.array(svm_model.get_sv_coef()).flatten()
        margins = (numpy.mat(weights) * test_kernel).A[0]

        self.log.debug("Performing Platt scaling")
        rho = svm_model.rho[0]
        probA = svm_model.probA[0]
        probB = svm_model.probB[0]
        #: :type: numpy.core.multiarray.ndarray
        probs = 1.0 / (1.0 + numpy.exp((margins - rho) * probA + probB))

        # If the positives' mean probability is below the mean over all
        # elements, the scale is treated as inverted and flipped.
        pos_probs = numpy.array(
            [probs[self._uid2idx_map[uid]] for uid in pos_ids]
        )
        pos_mean_prob = pos_probs.sum() / pos_probs.size
        total_mean_prob = probs.sum() / probs.size
        if pos_mean_prob < total_mean_prob:
            probs = 1.0 - probs

        probability_map = dict(zip(self._uid_array, probs))

        return probability_map
Exemplo n.º 51
0
from pybrain.datasets import ClassificationDataSet
print("Reading data set...")
DS = ClassificationDataSet.loadFromFile('dataset.csv')

# Hold out a quarter of the samples for testing; the rest is used for training.
TestDS, TrainDS = DS.splitWithProportion(0.25)

# Train the SVM.
from svm import svm_problem, svm_parameter, libsvm, gen_svm_nodearray

# Build the libsvm problem from the pybrain data set.
# A good Python-side explanation of libsvm lives at
# https://github.com/arnaudsj/libsvm/tree/master/python
# The low-level C interface wants plain ints and plain lists, hence the
# explicit conversions below.
train_targets = [int(t) for t in TrainDS['target']]
train_inputs = [list(i) for i in TrainDS['input']]
prob = svm_problem(train_targets, train_inputs)

# Options:
#   -t 0    : linear kernel
#   -c 0.01 : regularization parameter (smaller means more regularization)
param = svm_parameter()
param.parse_options('-t 0 -c 0.01')

print("Training svm...")
model = libsvm.svm_train(prob, param)

print("Testing svm with three random inputs")
from random import randrange
for _ in range(3):
    i = randrange(0, len(TestDS))
    # The low-level interface needs the sample converted to a node array.
    sample = list(TestDS['input'][i])
    x0, m_idx = gen_svm_nodearray(sample)
    # NOTE(review): the prediction is computed but never printed or stored.
    prediction = libsvm.svm_predict(model, x0)
Exemplo n.º 52
0
 def train(self, features, labels):
     """Fit an SVM model on ``features`` / ``labels`` and store it on ``self.model``.

     ``labels`` must be a numpy array (enforced by an assertion); it is
     converted to a plain list before being handed to libsvm. ``features``
     is first passed through ``self._cleanse_features``.
     """
     assert isinstance(labels, np.ndarray), "labels should be numpy array"
     cleansed = self._cleanse_features(features)
     label_list = labels.tolist()
     self.model = svm.svm_model(svm.svm_problem(label_list, cleansed),
                                self._svm_parameter)
Exemplo n.º 53
0
    def _train(self, class_examples, **extra_params):
        """
        Internal method that trains the classifier implementation.

        This method is called after checking that there is not already a model
        trained, thus it can be assumed that no model currently exists.

        The class labels will have already been checked before entering this
        method, so it can be assumed that the ``class_examples`` will contain
        at least two classes.

        Side effects: sets ``self.svm_label_map`` and ``self.svm_model`` and,
        when the corresponding elements are set and writable, stores the
        pickled label map in ``self.svm_label_map_elem`` and the serialized
        model in ``self.svm_model_elem``.

        :param class_examples: Dictionary mapping class labels to iterables of
            DescriptorElement training examples.
        :type class_examples: dict[collections.abc.Hashable,
                 collections.abc.Iterable[smqtk.representation.DescriptorElement]]

        :param extra_params: Dictionary with extra parameters for training.
            This is not used by this implementation.
        :type extra_params: None | dict[basestring, object]

        """

        # Offset from 0 for positive class labels to use
        # - not using label of 0 because we think libSVM wants positive labels
        CLASS_LABEL_OFFSET = 1

        # Stuff for debug reporting: pass libSVM's quiet flag unless we are
        # logging at DEBUG level or lower.
        param_debug = {'-q': ''}
        if self._log.getEffectiveLevel() <= logging.DEBUG:
            param_debug = {}

        # Form libSVM problem input values
        self._log.debug("Formatting problem input")
        train_labels = []
        train_vectors = []
        train_group_sizes = []  # number of examples per class
        self.svm_label_map = {}
        # Making SVM label assignment deterministic to alphabetic order
        for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
            # Map integer SVM label to semantic label
            self.svm_label_map[i] = l

            self._log.debug('-- class %d (%s)', i, l)
            # requires a sequence, so making the iterable ``g`` a tuple
            g = class_examples[l]
            if not isinstance(g, collections.abc.Sequence):
                self._log.debug('   (expanding iterable into sequence)')
                g = tuple(g)

            train_group_sizes.append(float(len(g)))
            x = numpy.array(DescriptorElement.get_many_vectors(g))
            x = self._norm_vector(x)
            train_labels.extend([i] * x.shape[0])
            train_vectors.extend(x.tolist())
            del g, x

        # Adjacent literals are concatenated before ``%`` is applied, so each
        # piece needs its own trailing space.
        assert len(train_labels) == len(train_vectors), \
            "Count mismatch between parallel labels and descriptor vectors " \
            "being sent to libSVM (%d != %d)" \
            % (len(train_labels), len(train_vectors))

        self._log.debug("Forming train params")
        #: :type: dict
        params = deepcopy(self.train_params)
        params.update(param_debug)
        # Calculating class weights if set to C-SVC type SVM
        if '-s' not in params or int(params['-s']) == 0:
            # (john.moeller): The weighting should probably be the geometric
            # mean of the number of examples over the classes divided by the
            # number of examples for the current class.
            gmean = scipy.stats.gmean(train_group_sizes)
            for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
                w = gmean / n
                params['-w' + str(i)] = w
                self._log.debug("-- class '%s' weight: %s",
                                self.svm_label_map[i], w)

        self._log.debug("Making parameters obj")
        svm_params = svmutil.svm_parameter(self._gen_param_string(params))
        self._log.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        # The vectors have been handed to the problem object; drop our list.
        del train_vectors
        self._log.debug("Training SVM model")
        self.svm_model = svmutil.svm_train(svm_problem, svm_params)
        self._log.debug("Training SVM model -- Done")

        if self.svm_label_map_elem and self.svm_label_map_elem.writable():
            self._log.debug("saving labels to element (%s)",
                            self.svm_label_map_elem)
            self.svm_label_map_elem.set_bytes(
                pickle.dumps(self.svm_label_map, -1)
            )
        if self.svm_model_elem and self.svm_model_elem.writable():
            self._log.debug("saving model to element (%s)",
                            self.svm_model_elem)
            # LibSvm I/O only works with filepaths, thus the need for an
            # intermediate temporary file.
            fd, fp = tempfile.mkstemp()
            try:
                # Wrap the descriptor returned by mkstemp() *before* saving so
                # that it is closed even when svm_save_model() raises;
                # previously a failed save leaked the open descriptor.
                # fdopen() is required because in Python 2 open() does
                # not accept a file descriptor.
                with os.fdopen(fd, 'rb') as f:
                    svmutil.svm_save_model(fp, self.svm_model)
                    # Nothing has been read through ``f`` yet, so this reads
                    # the freshly written model from the start of the file.
                    self.svm_model_elem.set_bytes(f.read())
            finally:
                os.remove(fp)
Exemplo n.º 54
0
Arquivo: main.py Projeto: 4AZ77mr/NLP
def result(model,
           text_list,
           label_list,
           NFR,
           ratio,
           add_name,
           string,
           Normalization=False):
    """Run a 5-fold stratified evaluation and write one prediction CSV per fold.

    For every fold the training texts are turned into a bag-of-words
    dictionary, train and test documents are densified against it, a
    classifier is fitted, and per-class precision/recall/F1 are computed.
    The scores at index 1 are added to the module-level accumulators
    ``pre_sum``, ``rec_sum`` and ``f1_sum``.

    :param model: ``'SMO'`` selects the libsvm path; any other value must be
        a key of ``modelselection``.
    :param text_list: Samples passed to the splitter and to ``loop``; the
        token list of each sample is read from index 1.
    :param label_list: Labels parallel to ``text_list``.
    :param NFR: Passed to ``word_count`` and used as an output path segment.
    :param ratio: Passed to ``word_count`` and used as an output path segment.
    :param add_name: Passed to ``word_count`` and used as an output path
        segment.
    :param string: Passed to ``word_count`` and embedded in the CSV file name.
    :param Normalization: Only switches the ``load_model`` tag between
        ``'NO'`` and ``'word2vec'``; see the review note in the body.
    """
    global pre_sum, rec_sum, f1_sum
    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    load_model = 'word2vec' if Normalization else 'NO'

    folds = splitter.split(text_list, label_list)
    for count, (tra_index, te_index) in enumerate(folds, 1):
        train_text, train_label = [], []
        test_text, test_label = [], []
        # ``loop`` fills the passed-in lists with the selected samples.
        loop(index=tra_index,
             X=text_list,
             y=label_list,
             textlist=train_text,
             labellist=train_label)
        loop(index=te_index,
             X=text_list,
             y=label_list,
             textlist=test_text,
             labellist=test_label)

        # data_train -> train: keep only the element at index 1 per sample.
        train = [sample[1] for sample in train_text]

        word_count(model=model,
                   NFR=NFR,
                   text=train,
                   labels=train_label,
                   ratio=ratio,
                   add_name=add_name,
                   string=string,
                   count=count,
                   load_model=load_model)

        # data_test -> test
        test = [sample[1] for sample in test_text]

        dictionary = corpora.Dictionary(train)

        # NOTE(review): the synonym-normalization branch is disabled in this
        # file, so when ``Normalization`` is truthy ``docs_train``,
        # ``docs_test`` and ``siki`` are never bound and the code below fails.
        if Normalization == False:
            docs_train = train
            docs_test = test
            siki = 9999

        bow_corpus_train = [dictionary.doc2bow(doc) for doc in docs_train]
        bow_corpus_test = [dictionary.doc2bow(doc) for doc in docs_test]

        # One dense vector per document, sized to the training vocabulary.
        vocab_size = len(dictionary)
        dense_all_train = [
            list(matutils.corpus2dense([bow], num_terms=vocab_size).T[0])
            for bow in bow_corpus_train
        ]
        dense_all_test = [
            list(matutils.corpus2dense([bow], num_terms=vocab_size).T[0])
            for bow in bow_corpus_test
        ]

        # NOTE(review): a model that is neither 'SMO' nor a key of
        # ``modelselection`` leaves ``label_predict`` unbound.
        if model == 'SMO':
            prob = svm_problem(train_label, dense_all_train)
            param = svm_parameter("-s 0 -t 0")
            mdl = svmutil.svm_train(prob, param)
            label_predict, accuracy, dec_values = svmutil.svm_predict(
                test_label, dense_all_test, mdl)
        elif model in modelselection:
            clf = modelselection[model]
            clf.fit(dense_all_train, train_label)
            label_predict = clf.predict(dense_all_test)

        # Per-class scores; only the entry at index 1 is accumulated.
        pre_sum += precision_score(test_label, label_predict, average=None)[1]
        rec_sum += recall_score(test_label, label_predict, average=None)[1]
        f1_sum += f1_score(test_label, label_predict, average=None)[1]

        # Columns: requirement text / correct label / predicted label.
        columns = ['要件', '正解', '予測']
        df = pd.DataFrame({
            '要件': test,
            '正解': test_label,
            '予測': label_predict
        },
                          columns=columns)

        dir_path = ('実験/NFR分類/予測結果/' + model + '_10/' + load_model + '/' +
                    add_name + '/' + NFR + '/' + str(ratio) + '/')
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        csv_name = str(siki) + '_' + str(count) + '(厚[' + string + ']).csv'
        df.to_csv(dir_path + '/' + csv_name)
Exemplo n.º 55
0
    def train(self, positive_classes, negatives):
        """
        Train the supervised SVM classifier model and optionally persist it.

        Input checking is delegated to the parent class ``train`` method,
        which is invoked first. Each positive class is assigned an integer
        SVM label starting at 1 (in sorted label order); the negative
        examples are assigned the SVM label ``-1``, mapped to
        ``self.NEGATIVE_LABEL``. The trained model is stored on
        ``self.svm_model`` and the integer-to-semantic label mapping on
        ``self.svm_label_map``. When ``self.svm_label_map_fp`` /
        ``self.svm_model_fp`` are set, the label map and the model are also
        written to those paths.

        :param positive_classes: Dictionary mapping positive class labels to
            iterables of DescriptorElement training examples.
        :type positive_classes:
            dict[collections.Hashable,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :param negatives: Iterable of negative DescriptorElement examples.
        :type negatives: collections.Iterable[smqtk.representation.DescriptorElement]

        :raises ValueError: The ``negative`` label was found in the
            ``positive_classes`` dictionary. This is reserved for the negative
            example class.
        :raises ValueError: There were no positive or negative examples.
        :raises RuntimeError: A model already exists in this instance.
            Following through with training would overwrite this model.

        """
        super(LibSvmClassifier, self).train(positive_classes, negatives)

        # Positive SVM labels start at 1
        # - label 0 is avoided because we think libSVM wants positive labels
        CLASS_LABEL_OFFSET = 1

        # Debug reporting: progress reports and libSVM output are enabled only
        # at DEBUG level or lower; otherwise libSVM gets its quiet flag.
        debug_enabled = self._log.getEffectiveLevel() <= logging.DEBUG
        etm_ri = 1.0 if debug_enabled else None
        param_debug = {} if debug_enabled else {"-q": ""}

        # Gather libSVM problem inputs
        self._log.debug("Formatting problem input")
        train_labels = []
        train_vectors = []
        train_group_sizes = []
        self.svm_label_map = {}
        # Sorted order keeps the SVM label assignment deterministic
        for svm_label, sem_label in enumerate(sorted(positive_classes),
                                              CLASS_LABEL_OFFSET):
            self.svm_label_map[svm_label] = sem_label
            self._log.debug("-- class %d (%s)", svm_label, sem_label)

            # A sequence is required, so expand other iterables into a tuple
            examples = positive_classes[sem_label]
            if not isinstance(examples, collections.Sequence):
                examples = tuple(examples)

            train_group_sizes.append(float(len(examples)))
            mat = self._norm_vector(
                elements_to_matrix(examples, report_interval=etm_ri)
            )
            train_labels.extend([svm_label] * mat.shape[0])
            train_vectors.extend(mat.tolist())
            del examples, mat

        self._log.debug("-- negatives (-1)")
        self.svm_label_map[-1] = self.NEGATIVE_LABEL
        # A sequence is required here as well
        if not isinstance(negatives, collections.Sequence):
            negatives = tuple(negatives)
        negatives_size = float(len(negatives))
        mat = self._norm_vector(
            elements_to_matrix(negatives, report_interval=etm_ri)
        )
        train_labels.extend([-1] * mat.shape[0])
        train_vectors.extend(mat.tolist())
        del negatives, mat

        self._log.debug(
            "Training elements: %d labels, %d vectors (should be the same)",
            len(train_labels), len(train_vectors)
        )

        self._log.debug("Forming train params")
        #: :type: dict
        params = deepcopy(self.train_params)
        params.update(param_debug)
        # Positive class weights are only needed for the C-SVC SVM type:
        # each class is weighted by the negatives-to-class size ratio,
        # floored at 1.0.
        is_c_svc = "-s" not in params or int(params["-s"]) == 0
        if is_c_svc:
            for svm_label, size in enumerate(train_group_sizes,
                                             CLASS_LABEL_OFFSET):
                weight = max(1.0, negatives_size / size)
                params["-w" + str(svm_label)] = weight

        self._log.debug("Making parameters obj")
        svm_params = svmutil.svm_parameter(self._gen_param_string(params))
        self._log.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        self._log.debug("Training SVM model")
        self.svm_model = svmutil.svm_train(svm_problem, svm_params)
        self._log.debug("Training SVM model -- Done")

        # Persist the label map and model when target paths are configured
        if self.svm_label_map_fp:
            self._log.debug("saving file -- labels -- %s", self.svm_label_map_fp)
            with open(self.svm_label_map_fp, "wb") as f:
                cPickle.dump(self.svm_label_map, f)
        if self.svm_model_fp:
            self._log.debug("saving file -- model -- %s", self.svm_model_fp)
            svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
def main(args):
	"""Grid-search SVM parameters, then print (and optionally log) CV and test results.

	``args[0]`` is the path of a parameter file whose contents are executed
	with ``exec``; the names used below that are not assigned in this function
	(e.g. ``datafilename``, ``testdatafilename``, ``crange``, ``gammarange``,
	``use_specific_fold_inds``, ``doscale``, ``donormalize``, ``dopca``,
	``savemodel``, ``numcpus``, ``choose_specific_features``,
	``specific_selected_features``, ``useprob``, ``timeout``, ``outputlog``
	and, when predefined folds are not used, ``nf``) are presumably defined by
	that file -- TODO confirm.

	When more than one argument is given, ``args[1]`` and ``args[2]`` replace
	the C and gamma search ranges with a single value each.
	"""
	paramsfn = args[0]
	# NOTE(review): this executes arbitrary code from the parameter file; only
	# use trusted files. The file object is not closed explicitly.
	exec(open(paramsfn,'r').read())
	
	
	# Optional command-line override of the search ranges.
	# NOTE(review): args[2] is read after only checking len(args) > 1.
	if len(args) > 1:
		crange = [float(args[1])]
		gammarange = [float(args[2])]
	
	output,input,fieldnames,fold_inds = load_data(datafilename,use_specific_fold_inds)
	# [-1] is the placeholder used when no predefined folds are requested.
	fold_start = [-1]
	if use_specific_fold_inds:
		# Reorder the rows so that every fold is contiguous, and record the
		# starting row of each fold (plus the final end offset) in fold_start.
		unique_fold_ids = unique(fold_inds)
		row_inds = []
		outputcopy = []
		inputcopy = zeros([size(input,0),size(input,1)],dtype='float64')
		fold_start = [0]
		
		curind = 0
		for ind in unique_fold_ids:
			row_inds = [i for i in xrange(len(fold_inds)) if fold_inds[i] == ind]
			inputcopy[curind:curind+len(row_inds),:] = input[row_inds,:]
			outputcopy.extend([output[i] for i in row_inds])
			curind += len(row_inds)
			
			fold_start.append(fold_start[-1]+len(row_inds))
		input = inputcopy
		output = outputcopy
		# Number of folds equals the number of distinct fold ids.
		nf = len(fold_start)-1
	
	if testdatafilename != '':
		output_test,input_test,fieldnames,fold_inds_test = load_data(testdatafilename,False)
	
	if doscale:
		# Min-max scaling per column; the test data reuses the training
		# minima/maxima.
		# NOTE(review): a constant column gives maxinput == mininput, i.e. a
		# division by zero here -- confirm the inputs exclude that case.
		maxinput = input.max(0);
		mininput = input.min(0);
		input = (input-mininput)/(maxinput-mininput)
		
		if testdatafilename != '':
			input_test = (input_test-mininput)/(maxinput-mininput)

		if savemodel:
			save_scale_data(datafilename+'_scales.dat',maxinput,mininput)

	if donormalize:
		# Z-score normalization per column; the test data reuses the training
		# means and standard deviations.
		means = input.mean(0)
		stds = sqrt(input.var(0))
		input = (input-means)/stds
		if testdatafilename != '':
			input_test = (input_test-means)/stds

		if savemodel:
			save_zscore_data(datafilename+'_meansstdevs.dat',means,stds)

			
	# Worker pool selection: 'auto' uses Pool() defaults, 1 passes an empty
	# string instead of a pool, anything else sets the pool size.
	if numcpus == 'auto':
		p = Pool()
	else:
		if numcpus == 1:
			p = ''
		else:
			p = Pool(numcpus)
	
	if choose_specific_features:
		# Repeat the whole search/evaluation once per feature subset.
		for specific_selected_choice in specific_selected_features:
			inputfiltered = input[:,specific_selected_choice]
			
			with Timer():
				if use_specific_fold_inds:
					results = mygrid.grid_classify_multi (crange,gammarange,output,[list(x) for x in inputfiltered],nf,useprob,timeout,p,fold_start)				
				else:
					results = mygrid.grid_classify_multi (crange,gammarange,output,[list(x) for x in inputfiltered],nf,useprob,timeout,p)
				
			# results[-2] is used as the chosen C and results[-1] as the chosen gamma.
			param = svm.svm_parameter('-c %g -g %g -b %d' % (results[-2],results[-1],int(useprob)))
			prob = svm.svm_problem(output, [list(x) for x in inputfiltered])
			target = (c_double * prob.l)()
			# Copy the fold offsets into a C int array for the libsvm call.
			fold_start_p = (c_int *len(fold_start))()
			for i in xrange(len(fold_start)):
				fold_start_p[i] = fold_start[i]
			
			# Re-run cross-validation with the chosen parameters; the
			# per-sample results are written into ``target``.
			libsvm.svm_cross_validation_labeltargets(prob, fold_start_p,param, nf, target)
			labels = unique(output)
			ACC,confusionmatrix = mygrid.evaluations_classify_multi(output, target,labels)
			
			# Chance agreement: sum over classes of
			# (column total * row total) / N^2.
			probchance = 0
			N = len(output)
			for i in xrange(len(labels)):
				# NOTE(review): nums_per_class_pred is assigned but never used.
				nums_per_class_pred =sum(confusionmatrix[:,i])
				probchance += (sum(confusionmatrix[:,i])*sum(confusionmatrix[i,:]))*1.0/(N*N)

			# Kappa from accuracy (ACC is divided by 100 here) and chance agreement.
			kappa = (ACC/100-probchance)*1.0/(1-probchance);
				

			print 'Optimal gamma = %g\nOptimal c = %g' % (results[-1],results[-2])
			print 'Top CV ACC = %g' % (ACC)
			print 'Top CV kappa = %g' % (kappa)
			# Print the confusion matrix as an ASCII table: header row of
			# labels, one row per label with its row total, then column totals.
			sys.stdout.write('=======')
			for i in xrange(len(labels)):
				sys.stdout.write('=========')
			print '=========='
			print '||   ||',
			for i in xrange(len(labels)):
				print '%6d |' % labels[i],
			print '       ||'
			sys.stdout.write('||=====')
			for i in xrange(len(labels)):
				sys.stdout.write('=========')
			print '========||'
			for i in xrange(len(labels)):
				print '||%3d||' % labels[i],
				for j in xrange(len(labels)):
					print '%6g |' % confusionmatrix[i,j],
				print '%6g ||' % sum(confusionmatrix[i,:])
			sys.stdout.write('||-----')
			for i in xrange(len(labels)):
				sys.stdout.write('---------')
			print '--------||'

			print '||   ||',
			for i in xrange(len(labels)):
				print '%6g |' % sum(confusionmatrix[:,i]),
			print '%6g ||' % N
			sys.stdout.write('=======')
			for i in xrange(len(labels)):
				sys.stdout.write('=========')
			print '=========='
			
			
			
			if savemodel:
				# Train on the full (filtered) training data and save the model.
				param = ('-c %g -g %g -b %d' % (results[-2],results[-1],int(useprob)))
				m = svm_train(output,[list(x) for x in inputfiltered],param)
				svm_save_model(datafilename + '.model',m)
				
								
			if outputlog != '':
				# Append the settings and the same confusion-matrix table
				# (indented by four spaces) to the log file.
				fout = open(outputlog,'a')
				print >> fout, '========================='
				print >> fout, datafilename
				print >> fout, doscale, donormalize, dopca, '(scale/norm/pca)'
				print >> fout, crange[0],crange[-1], gammarange[0], gammarange[-1], '(cs,gammas)'
				print >> fout, use_specific_fold_inds, nf, '(use specific folds, numfold)'
				print >> fout, 'SPECIFIC FIELDS:'
				print >> fout, specific_selected_choice
				if fieldnames != []:
					for i in specific_selected_choice:
						print >> fout, fieldnames[i],
					print >> fout
				print >> fout, 'train: '
				print >> fout, '    ACC=%g,kappa=%g (g=%g,c=%g)' % (ACC,kappa,results[-1],results[-2])
				fout.write('    =======')
				for i in xrange(len(labels)):
					fout.write('=========')
				print >> fout, '=========='
				print >> fout, '    ||   ||',
				for i in xrange(len(labels)):
					print >> fout, '%6d |' % labels[i],
				print >> fout, '       ||'
				fout.write('    ||=====')
				for i in xrange(len(labels)):
					fout.write('=========')
				print >> fout, '========||'
				for i in xrange(len(labels)):
					print >> fout, '    ||%3d||' % labels[i],
					for j in xrange(len(labels)):
						print >> fout, '%6g |' % confusionmatrix[i,j],
					print >> fout, '%6g ||' % sum(confusionmatrix[i,:])
				fout.write('    ||-----')
				for i in xrange(len(labels)):
					fout.write('---------')
				print >> fout, '--------||'
	
				print >> fout, '    ||   ||',
				for i in xrange(len(labels)):
					print >> fout, '%6g |' % sum(confusionmatrix[:,i]),
				print >> fout, '%6g ||' % N
				fout.write('    =======')
				for i in xrange(len(labels)):
					fout.write('=========')
				print >> fout, '=========='
				fout.close()
		
			if testdatafilename != '':
				# Train on all training data with the chosen parameters and
				# evaluate on the separate test set.
				inputfiltered_test = input_test[:,specific_selected_choice]
				param = ('-c %g -g %g -b %d' % (results[-2],results[-1],int(useprob)))
				m = svm_train(output,[list(x) for x in inputfiltered],param)	
				pred_labels, (ACC, MSE, SCC), pred_values = svm_predict(output_test,[list(x) for x in inputfiltered_test],m,'-b %d' % (int(useprob)))
				labels = m.get_labels()
				# ACC from svm_predict is replaced by the value computed here.
				ACC,confusionmatrix = mygrid.evaluations_classify_multi(output_test, pred_labels, labels)
								
				# Same chance-agreement / kappa computation as for the CV results.
				probchance = 0
				N = len(output_test)
				for i in xrange(len(labels)):
					# NOTE(review): nums_per_class_pred is assigned but never used.
					nums_per_class_pred =sum(confusionmatrix[:,i])
					probchance += (sum(confusionmatrix[:,i])*sum(confusionmatrix[i,:]))*1.0/(N*N)

				kappa = (ACC/100-probchance)*1.0/(1-probchance);
					
				print 'Test optimized accuracy = %g' % (ACC)
				print 'Test optimized kappa = %g' % (kappa)
				sys.stdout.write('=======')
				for i in xrange(len(labels)):
					sys.stdout.write('=========')
				print '=========='
				print '||   ||',
				for i in xrange(len(labels)):
					print '%6d |' % labels[i],
				print '       ||'
				sys.stdout.write('||=====')
				for i in xrange(len(labels)):
					sys.stdout.write('=========')
				print '========||'
				for i in xrange(len(labels)):
					print '||%3d||' % labels[i],
					for j in xrange(len(labels)):
						print '%6g |' % confusionmatrix[i,j],
					print '%6g ||' % sum(confusionmatrix[i,:])
				sys.stdout.write('||-----')
				for i in xrange(len(labels)):
					sys.stdout.write('---------')
				print '--------||'

				print '||   ||',
				for i in xrange(len(labels)):
					print '%6g |' % sum(confusionmatrix[:,i]),
				print '%6g ||' % N
				sys.stdout.write('=======')
				for i in xrange(len(labels)):
					sys.stdout.write('=========')
				print '=========='

			
	else:
		# All features are used; no fold_start is passed to the grid search
		# in this branch.
		
		with Timer():
			results = mygrid.grid_classify_multi (crange,gammarange,output,[list(x) for x in input],nf,useprob,timeout,p)

		param = svm.svm_parameter('-c %g -g %g -b %d' % (results[-2],results[-1],int(useprob)))
		prob = svm.svm_problem(output, [list(x) for x in input])
		target = (c_double * prob.l)()
		labels = unique(output)
		print 'Optimal gamma = %g\nOptimal c = %g' % (results[-1],results[-2])
		# NOTE(review): this call passes four arguments, while the call in the
		# feature-subset branch above also passes a fold-start array as the
		# second argument -- confirm which signature the library expects.
		libsvm.svm_cross_validation_labeltargets(prob, param, nf, target)
		ACC,confusionmatrix = mygrid.evaluations_classify_multi(output, target,labels)
		print 'Top CV ACC = %g' % (ACC)

		# Confusion-matrix table for the cross-validation results.
		sys.stdout.write('=======')
		for i in xrange(len(labels)):
			sys.stdout.write('=========')
		print '=========='
		print '||   ||',
		for i in xrange(len(labels)):
			print '%6d |' % labels[i],
		print '       ||'
		sys.stdout.write('||=====')
		for i in xrange(len(labels)):
			sys.stdout.write('=========')
		print '========||'
		for i in xrange(len(labels)):
			print '||%3d||' % labels[i],
			for j in xrange(len(labels)):
				print '%6g |' % confusionmatrix[i,j],
			print '%6g ||' % sum(confusionmatrix[i,:])
		sys.stdout.write('||-----')
		for i in xrange(len(labels)):
			sys.stdout.write('---------')
		print '--------||'

		print '||   ||',
		for i in xrange(len(labels)):
			print '%6g |' % sum(confusionmatrix[:,i]),
		print '%6g ||' % len(output)
		sys.stdout.write('=======')
		for i in xrange(len(labels)):
			sys.stdout.write('=========')
		print '=========='
			
		
		del target
		
		if savemodel:
			# Train on the full training data and save the model.
			param = ('-c %g -g %g -b %d' % (results[-2],results[-1],int(useprob)))
			m = svm_train(output,[list(x) for x in input],param)
			svm_save_model(datafilename+'.model',m)
		
		if testdatafilename != '':
			# Train on all training data with the chosen parameters and
			# evaluate on the separate test set.
			param = ('-c %g -g %g -b %d' % (results[-2],results[-1],int(useprob)))
			m = svm_train(output,[list(x) for x in input],param)

			pred_labels, (ACC, MSE, SCC), pred_values = svm_predict(output_test,[list(x) for x in input_test],m,'-b %d' % (int(useprob)))
			labels = m.get_labels()
			# ACC from svm_predict is replaced by the value computed here.
			ACC,confusionmatrix = mygrid.evaluations_classify_multi(output_test, pred_labels, labels)

			print 'Test optimized accuracy = %g' % (ACC)
			sys.stdout.write('=======')
			for i in xrange(len(labels)):
				sys.stdout.write('=========')
			print '=========='
			print '||   ||',
			for i in xrange(len(labels)):
				print '%6d |' % labels[i],
			print '       ||'
			sys.stdout.write('||=====')
			for i in xrange(len(labels)):
				sys.stdout.write('=========')
			print '========||'
			for i in xrange(len(labels)):
				print '||%3d||' % labels[i],
				for j in xrange(len(labels)):
					print '%6g |' % confusionmatrix[i,j],
				print '%6g ||' % sum(confusionmatrix[i,:])
			sys.stdout.write('||-----')
			for i in xrange(len(labels)):
				sys.stdout.write('---------')
			print '--------||'

			print '||   ||',
			for i in xrange(len(labels)):
				print '%6g |' % sum(confusionmatrix[:,i]),
			print '%6g ||' % len(output_test)
			sys.stdout.write('=======')
			for i in xrange(len(labels)):
				sys.stdout.write('=========')
			print '=========='

		
		if outputlog != '':
			# Append the raw grid-search results to the log file.
			fout = open(outputlog,'a')
			print >> fout, results#[:-1]
#			for key in results[-1].keys():
#				print >> fout, key, results[-1][key]
			fout.close()
Exemplo n.º 57
0
	# Fragment of a larger function: ``dataSet`` and ``label`` are expected to
	# exist already (they are extended below); their initialization is not
	# visible here.
	# Load each CSV file; the file's position in the list is used as the class
	# label for all of its instances.
	for i, fileName in enumerate( ['./dataset2/avon.csv', './dataset2/brian_merge.csv', './dataset2/mon_merge.csv', './dataset2/nofar_merge.csv'] ):
		tmp = readDataset(fileName) # array of Instance
		dataSet = dataSet + tmp
		print 'size:', len(tmp)
		label = label + [i]*len(tmp)
		# NOTE(review): the shuffle runs inside the loop, i.e. once per file,
		# always with random_state=0 -- confirm this is intended.
		dataSet, label = shuffle(dataSet, label, random_state=0)
	# The first TRAIN_SET_RATIO share of the shuffled data is the training set.
	cutIndex = int(TRAIN_SET_RATIO*len(dataSet))
	## use accel_abs and alpha_abs as input for encoding respectively
	print 'learning dictionary'
	data_accel = [I.accel_abs() for I in dataSet]
	data_alpha = [I.alpha_abs() for I in dataSet]
	# Dictionaries are built from the training portion only ...
	RPDictionary_accel = Dictionary(PATCH_SIZE, data_accel[:cutIndex])
	RPDictionary_alpha = Dictionary(PATCH_SIZE, data_alpha[:cutIndex])
	# ... but every sample (train and test) is encoded; the two encodings of
	# each sample are combined with ``+``.
	aggregate_feature = [ f[0]+f[1] for f in zip( RPDictionary_accel.encoding(data_accel), RPDictionary_alpha.encoding(data_alpha) ) ]
	#aggregate_feature = preprocessing.scale(aggregate_feature) ## scale columns independently to have zero mean and unit variance

	# Features are written to files and read back below for training/testing.
	writeFeature('./svm_train', aggregate_feature[:cutIndex], label[:cutIndex]) 
	writeFeature('./svm_test', aggregate_feature[cutIndex:], label[cutIndex:]) 

	## SVM training
	X_train, Y_train = readFeature('./svm_train',PATCH_SIZE*2)
	prob = svm_problem(Y_train, X_train)
	# libsvm options: -t 1 polynomial kernel, -d 2 degree 2, -q quiet mode.
	param = svm_parameter('-t 1 -q -d 2')
	model = svm_train(prob, param)

	## SVM predicting
	X_test, Y_test = readFeature('./svm_test',PATCH_SIZE*2)
	p_labels, p_acc, p_vals = svm_predict(Y_test, X_test, model)
	print p_acc	
	print confusion_matrix(Y_test, p_labels)
Exemplo n.º 58
0
#!/usr/bin/python
# encoding: utf-8
import svm
import svmutil

print('test')

# Toy training set: three samples, each with its own class label.
train_labels = [13, 9, 7]
train_samples = [[1, 1, 1], [1, 0, 1], [1, 1, 0]]  # data
prob = svm.svm_problem(train_labels, train_samples)  # training data

# Linear kernel with cost parameter C = 10.
param = svm.svm_parameter()
param.kernel_type = svm.LINEAR
param.C = 10

model = svmutil.svm_train(prob, param)  # training

# Prediction for a new sample.
# (An earlier variant predicted on [[1, 1, 1]] with label [1].)
query_labels = [3]
query_samples = [[1, 0, 0]]
outcome = svmutil.svm_predict(query_labels, query_samples, model)
print(outcome)