def __init__(self, data_dictionary, model_target, kernel=LINEAR, cv_segments=10, **args):
    """Build, tune and fit an SVM model on a dictionary of named data columns.

    data_dictionary -- mapping of column name -> values; it is deep-copied,
        so the caller's object is never modified.
    model_target -- key of the column that holds the response variable.
    kernel -- libsvm kernel constant stored under 'kernel_type'.
    cv_segments -- stored as self.folds (number of cross-validation folds,
        going by the name; it is not used directly in this method).
    args -- optional 'threshold' keyword; defaults to 2.3711 when absent.
    """
    # Fall back to the default threshold when the caller did not supply one.
    self.threshold = args.get('threshold', 2.3711)

    # Keep the bookkeeping values before any helper method is invoked.
    working_copy = deepcopy(data_dictionary)
    self.model_target = model_target
    self.folds = cv_segments

    # Label the exceedances in the response column and detach it from the
    # covariates in one step; what is left in working_copy are the predictors.
    self.training_labels = self.Assign_Labels(working_copy.pop(model_target))
    self.training_set = np.transpose(working_copy.values())
    self.headers = working_copy.keys()

    # Scale the covariates to [-1,1] (per the original author's note).
    self.Scale_Covariates()

    # First fit: class 0 is weighted 10x relative to class 1.
    self.svm_problem = svm.svm_problem(self.training_labels, self.training_set)
    self.svm_params = {'kernel_type' : kernel, 'weight_label' : [0,1], 'weight' : [10,1]}
    self.model = svm.svm_model(self.svm_problem, svm.svm_parameter(**self.svm_params))

    # Tune the model through the class's own search routine.
    self.Select_Linear_Model(-5, 10)

    # Final fit with probability estimates of class membership switched on.
    self.svm_params['probability'] = 1
    self.model = svm.svm_model(self.svm_problem, svm.svm_parameter(**self.svm_params))
def constructSVMModels(self, db_250k, arrays_to_form_model, array_id2median_intensity,
                       minPercUnCoveredByLerContig=0.6, cnv_method_id=6, kernel_type=None, C=10, gamma=0.,
                       eps=1e-2, deletedFractionType=1):
    """
    Train one probability-enabled SVM per array and return {array_id: model}.

    Arrays that are missing from array_id2median_intensity are skipped.
    All models share a single svm_parameter object built from C, gamma,
    eps and kernel_type (RBF when kernel_type is None).

    deletedFractionType is forwarded to getCNVFeatureData; per the original
    change log (2010-7-25):
        1: CNVCall.percUnCoveredByLerContig
        2: CNVCall.fractionDeletedInPECoverageData
    """
    sys.stderr.write("Constructing SVM models for %s arrays ...\n"%(len(arrays_to_form_model)))
    from svm import svm_problem, svm_parameter, svm_model, cross_validation, LINEAR, POLY, RBF

    chosen_kernel = RBF if kernel_type is None else kernel_type
    shared_param = svm_parameter(C=C, eps=eps, probability=1, gamma=gamma, kernel_type=chosen_kernel)

    array_id2model = {}
    for array_id in arrays_to_form_model:
        # a model array has to be in array_id2median_intensity
        if array_id in array_id2median_intensity:
            feature_data = self.getCNVFeatureData(
                db_250k, array_id=array_id,
                minPercUnCoveredByLerContig=minPercUnCoveredByLerContig,
                cnv_method_id=cnv_method_id,
                replaceAmpWithMedianIntensity=False,
                deletedFractionType=deletedFractionType)
            training_problem = svm_problem(feature_data.class_label_ls, feature_data.feature_data)
            array_id2model[array_id] = svm_model(training_problem, shared_param)

    sys.stderr.write("%s models.\n"%(len(array_id2model)))
    return array_id2model
def train(self, examples, parameters=None):
    """Train a class-weighted libsvm model on the given examples.

    examples -- iterable of example records; element [1] of each record is
        used as the label and element [2] as the feature sample.
    parameters -- optional dict of keyword arguments for svm.svm_parameter.
        A lower-case "c" key is accepted as an alias for "C". The dict is
        copied, so the caller's object is left untouched, and None is
        treated as "no extra parameters".

    The trained model is stored in self.model. Each class found in
    self.classes (label -> example count) is weighted by
    1 - count / total, so rarer classes get the larger weight.
    """
    self.isBinary = self.isBinaryProblem(examples)
    examples = self.filterTrainingSet(examples)
    ExampleUtils.writeExamples(examples, self.tempDir + "/train.dat")

    # Prepare parameters on a private copy: the signature advertises None as
    # the default, and renaming "c" must not leak into the caller's dict.
    parameters = dict(parameters) if parameters is not None else {}
    if "c" in parameters:
        assert "C" not in parameters, "both 'c' and 'C' were given"
        parameters["C"] = parameters.pop("c")

    # Per-class weights, listed in sorted label order.
    totalExamples = float(sum(self.classes.values()))
    weight_label = sorted(self.classes.keys())
    weight = [1.0 - self.classes[k] / totalExamples for k in weight_label]
    libSVMparam = svm.svm_parameter(nr_weight=len(self.classes),
                                    weight_label=weight_label,
                                    weight=weight,
                                    **parameters)

    # Single pass over the examples, in case they are a one-shot iterable.
    labels = []
    samples = []
    for example in examples:
        labels.append(example[1])
        samples.append(example[2])

    problem = svm.svm_problem(labels, samples)
    self.model = svm.svm_model(problem, libSVMparam)
def load_model(self, file_name=''):
    """
    Load the svm model stored in file_name.

    When file_name is empty, '<name(self)>.model' is used instead. The
    resolved file name is then passed on to the parent class loader.
    """
    if not file_name:
        file_name = name(self) + '.model'
    self.model = svm.svm_model(file_name)
    super(SvmLearner, self).load_model(file_name)
def train(self,labels,data):
    '''
    Fit an epsilon-SVR model with an RBF kernel.

    @param labels: A list of target values, one per feature vector.
    @param data: A 2D array or list of feature vectors. One feature vector per row.
    '''
    # Lists and tuples of vectors become double arrays; labels are always
    # converted because they are turned back into a list further down.
    if isinstance(data,(list,tuple)):
        data = np.array(data,dtype=np.double)
    labels = np.array(labels,dtype=np.double)

    # Run both preprocessing stages; each returns the (labels, data) pair.
    labels,data = self._preprocessor.train(labels,data)
    labels,data = self._label_scale.train(labels,data)

    # libsvm takes plain Python lists, not numpy arrays.
    svr_problem = svm.svm_problem(labels.tolist(),data.tolist())
    svr_param = svm.svm_parameter(svm_type=svm.EPSILON_SVR,
                                  kernel_type=svm.RBF,
                                  p=self._epsilon,
                                  gamma=self._gamma)

    # Train the svm and keep the fitted model.
    self._model = svm.svm_model(svr_problem, svr_param)
def train(self, labels, data):
    '''
    Fit an epsilon-SVR model with an RBF kernel.

    @param labels: A list of target values, one per feature vector.
    @param data: A 2D array or list of feature vectors. One feature vector per row.
    '''
    # Lists and tuples of vectors become double arrays; labels are always
    # converted because they are turned back into a list further down.
    if isinstance(data, (list, tuple)):
        data = np.array(data, dtype=np.double)
    labels = np.array(labels, dtype=np.double)

    # Run both preprocessing stages; each returns the (labels, data) pair.
    labels, data = self._preprocessor.train(labels, data)
    labels, data = self._label_scale.train(labels, data)

    # libsvm takes plain Python lists, not numpy arrays.
    svr_problem = svm.svm_problem(labels.tolist(), data.tolist())
    svr_param = svm.svm_parameter(svm_type=svm.EPSILON_SVR,
                                  kernel_type=svm.RBF,
                                  p=self._epsilon,
                                  gamma=self._gamma)

    # Train the svm and keep the fitted model.
    self._model = svm.svm_model(svr_problem, svr_param)
def _stop_training(self): """ Trains and creates the model. """ # reset variables self.problems = [] self.models = [] self.labels = [] self._dosim = numpy.zeros(self._output_dim, dtype='int') # finally generate the models for n in range(self._output_dim): # get labels (min,max) self.labels.append((self.Y[:, n].max(), self.Y[:, n].min())) if self.labels[n][0] == self.labels[n][1]: # apport simulation if there is only one label self._dosim[n] = 1 self.problems.append(None) self.models.append(None) continue # construct problems self.problems.append(svm.svm_problem(self.Y[:, n], self.X)) # generate models self.models.append(svm.svm_model(self.problems[n], self.parameters)) # check if there are only 2 classes if self.models[n].get_nr_class() > 2: raise mdp.NodeException( "Only binary classification possible with libsvm for now !" ) # reset data for training self.reset_model()
def load(self):
    """Restore a previously saved learner for self.className from the
    current working directory.

    Three files are read: '<className>_learner' (the libsvm model),
    '<className>_learner.csv' (labeled data) and '<className>_learner.txt'
    (a Python-literal dict with at least 'confusion_matrix' and 'NBC').
    If the model file is missing a note is printed and nothing is loaded.

    Side effects: appends to self.models, sets self.nbc and
    self.test_results, and prepends the stored rows that are not already
    present to self.unlabeled_datasets.data.
    """
    learner_file = '%s_learner' % self.className

    # Check to see if learner already exists
    if learner_file not in os.listdir('.'):
        print('Note: %s learner does not exist yet' % self.className)
        return
    print(learner_file)

    # Update the model
    self.models.append(svm.svm_model(learner_file))

    # Update the labeled data; the first CSV column becomes the index
    temp = pd.read_csv('%s_learner.csv' % self.className)
    temp.rename(columns={temp.columns[0]: 'index'}, inplace=True)
    temp.set_index('index', inplace=True)

    # The context manager guarantees the handle is closed even if the
    # contents fail to evaluate.
    with open('%s_learner.txt' % self.className, "r") as text_file:
        # SECURITY NOTE: eval() executes whatever the file contains. Only
        # load learner files produced by this program.
        details = eval(text_file.read())

    # Print most recent accuracy
    print('Last accuracy: %s' % details['confusion_matrix'])
    self.nbc = details['NBC']

    # Convert everything back to a dataframe where possible; entries that
    # pandas cannot convert are deliberately left as they are.
    for el in list(details.keys()):
        try:
            details[el] = pd.DataFrame(details[el])
        except Exception:
            continue

    # Update most recent test results
    self.test_results = details

    # Prepend the stored rows whose index is not already in the live data.
    known_index = self.unlabeled_datasets.data.index
    missing = [el for el in temp.index if el not in known_index]
    self.unlabeled_datasets.data = pd.concat([temp.ix[missing], self.unlabeled_datasets.data])
def train(self, c, g, probability=True, compensation=True, path=None, filename=None, save=True):
    """Train an RBF-kernel libsvm model and optionally save it to disk.

    c, g -- values for the libsvm parameters C and gamma.
    probability -- truthy to enable probability estimates.
    compensation -- truthy to apply the class weights returned by
        self._calculateCompensation(labels).
    path -- output directory; defaults to self.dctEnvPaths['data'].
    filename -- model file name; defaults to the ARFF file name (option
        'strArffFileName') with its extension replaced by '.model'.
    save -- truthy to write the model to path/filename.

    Returns the tuple (problem, model).
    """
    if filename is None:
        arff_stem = os.path.splitext(self.getOption('strArffFileName'))[0]
        filename = arff_stem + '.model'
    if path is None:
        path = self.dctEnvPaths['data']

    param = svm.svm_parameter(kernel_type=svm.RBF, C=c, gamma=g,
                              probability=int(bool(probability)))
    labels, samples = self.getData(normalize=True)

    # because we train the SVM with dict we need to redefine the zero-insert
    self.hasZeroInsert = False
    if self.oClassifier is not None:
        self.oClassifier.setOption('hasZeroInsert', True)

    if compensation:
        weight, weight_label = self._calculateCompensation(labels)
        param.weight = weight
        param.weight_label = weight_label
        param.nr_weight = len(weight)

    problem = svm.svm_problem(labels, samples)
    model = svm.svm_model(problem, param)
    if save:
        target_file = os.path.join(path, filename)
        model.save(target_file)
    return problem, model
def train(self, search=False, **kwargs):
    """ Fit the SVM to the dataset held in self.ds and hand the resulting
    model to self.svm. For RBF kernels (the default), an optional
    meta-parameter search can be performed.

    All keyword arguments are first passed to self.setParams.

    @param search: optional name of grid search class to use for RBF kernels: 'GridSearch' or 'GridSearchDOE'
    @param log2g: base 2 log of the RBF width parameter
    @param log2C: base 2 log of the slack parameter
    @param searchlog: filename into which to dump the search log
    @param others: ...are passed through to the grid search and/or libsvm """
    self.setParams(**kwargs)

    targets = self.ds['target'].flatten()
    inputs = self.ds['input'].tolist()
    # NOTE: the name `problem` is referenced inside the eval string below.
    problem = svm_problem(targets, inputs)

    if not search:
        # plain training with the current parameter set
        param = svm_parameter(**self.params)
        model = svm_model(problem, param)
        logging.info("Training completed with parameters:")
        logging.info(repr(param))
    else:
        # this is a bit of a hack: `search` is evaluated as the name of the
        # search class, so it must come from trusted code only.
        model = eval(
            search
            + "(problem, self.svmtarget, cmin=[0,-7],cmax=[25,1], cstep=[0.5,0.2],plotflag=self.plot,searchlog=self.searchlog,**self.params)"
        )

    self.svm.setModel(model)
def train(self, c, g, probability=True, compensation=True, path=None, filename=None, save=True):
    """Train an RBF-kernel libsvm model and optionally save it to disk.

    c, g -- values for the libsvm parameters C and gamma.
    probability -- truthy to enable probability estimates.
    compensation -- truthy to apply the class weights returned by
        self._calculateCompensation(labels).
    path -- output directory; defaults to self.data_dir.
    filename -- model file name; defaults to self.arff_file with its
        extension replaced by '.model'.
    save -- truthy to write the model to path/filename.

    Returns the tuple (problem, model).
    """
    if filename is None:
        arff_stem = splitext(self.arff_file)[0]
        filename = arff_stem + '.model'
    if path is None:
        path = self.data_dir

    param = svm.svm_parameter(kernel_type=svm.RBF, C=c, gamma=g,
                              probability=int(bool(probability)))
    labels, samples = self.getData(normalize=True)

    # because we train the SVM with dict we need to redefine the zero-insert
    self.has_zero_insert = False
    if self.classifier is not None:
        self.classifier.setOption('hasZeroInsert', True)

    if compensation:
        weight, weight_label = self._calculateCompensation(labels)
        param.weight = weight
        param.weight_label = weight_label
        param.nr_weight = len(weight)

    problem = svm.svm_problem(labels, samples)
    model = svm.svm_model(problem, param)
    if save:
        target_file = os.path.join(path, filename)
        model.save(target_file)
    return problem, model
def svm(y,K,**param_kw):
    """
    Solve the SVM problem for a precomputed kernel. Return ``(alpha, b)``

    `y`  labels
    `K`  precomputed kernel matrix

    Any further keyword arguments are handed to ``svm_parameter`` unchanged.

    This wrapper does two jobs: it prepends a 1-based row index column to
    the kernel matrix before passing it to libsvm, and it turns the trained
    model into whatever ``get_alpha_b`` extracts from it, instead of
    returning libsvm's high-level model object.
    """
    # Column vector 1..len(K), placed in front of the kernel values.
    row_ids = arange(1, len(K) + 1).reshape((-1, 1))
    indexed_kernel = asarray(hstack((row_ids, K)), dtype=double)
    targets = asarray(y, dtype=double)

    problem = svm_problem(targets, indexed_kernel)
    settings = svm_parameter(kernel_type=PRECOMPUTED, **param_kw)
    return get_alpha_b(svm_model(problem, settings))
def predict_post():
    """
    Prediction page

    @@@
    # args

    | args | nullable | type | remark |
    |--------|--------|--------|--------|
    | strategy | false | string | 'bayes' or 'svm' |
    | company | false | string | stock symbol |
    | length | false | int | the date length for prediction |

    # return
    | return | type | remark |
    |--------|--------|--------|
    | pred | list | predicted results |
    @@@
    """
    form = predictForm()
    strategy = request.form['strategy']
    company = request.form['company']
    length = request.form['length']

    if strategy == 'bayes':
        pred = bayes_model(company, int(length))
    elif strategy == 'svm':
        pred = svm_model(company, int(length))
    else:
        # Previously an unsupported strategy left `pred` unbound and the
        # request died with an UnboundLocalError; reject it explicitly.
        from flask import abort
        abort(400)

    # The template is documented to receive a list. Array-like results are
    # converted; results that are already plain lists pass through as-is.
    if hasattr(pred, 'tolist'):
        pred = pred.tolist()

    # `dynamic` carries the current timestamp into the template.
    return render_template('predict.html', form=form, pred=pred,
                           dynamic=time.time(), strategy=strategy)
def train(self, session, doc):
    """Train a libsvm model from `doc`, save it and switch to predicting.

    doc.get_raw(session) must yield a pair: [[class,...], [{vector},...]].
    The model is trained with self.param, written to the path configured
    as 'modelPath', and self.predicting is set to 1.
    """
    raw_pair = doc.get_raw(session)
    labels, vectors = raw_pair

    training_problem = svm.svm_problem(labels, vectors)
    self.model = svm.svm_model(training_problem, self.param)

    destination = str(self.get_path(session, 'modelPath'))
    self.model.save(destination)
    self.predicting = 1
def train(self, dataset):
    """
    Train the svm classifier on `dataset`.

    The parent class processes the dataset first; the model is then fitted
    on self.results (labels) and self.observations (samples) using a
    linear kernel with C=10 and probability estimates enabled.
    """
    super(SvmLearner, self).train(dataset)
    training_problem = svm.svm_problem(self.results, self.observations)
    settings = svm.svm_parameter(kernel_type=svm.LINEAR, C=10, probability=1)
    self.model = svm.svm_model(training_problem, settings)
def Select_Linear_Model(self, C_min=-10, C_steps=11):
    """Search for the C value that gives the smallest CV error, then refit.

    The search is coarse-to-fine: one pass with step 1 starting at C_min,
    followed by two 50-step passes (steps 0.08 and 0.02) that start just
    below log2 of the best C found so far. The winner is stored in
    self.svm_params['C'] and self.model is rebuilt with it.
    """
    # Coarse pass over the caller-supplied range.
    best_C = self.__Linear_Search__(C_min, C_steps, 1)

    # Refinement passes: (offset below log2(best), number of steps, step size).
    for offset, steps, step_size in ((2, 50, 0.08), (0.5, 50, 0.02)):
        best_C = self.__Linear_Search__(np.log2(best_C) - offset, steps, step_size)

    self.svm_params['C'] = best_C
    tuned_parameter = svm.svm_parameter(**self.svm_params)
    self.model = svm.svm_model(self.svm_problem, tuned_parameter)
def train(self,trainset):
    """
    Trains the SVM (C-SVC) on `trainset` and stores the model in self.svm.

    `trainset` must iterate over (input, target) pairs and expose
    metadata['targets'] (and metadata['class_to_id'] when label weights
    are used). Raises ValueError for an unknown kernel name.
    """
    self.n_classes = len(trainset.metadata['targets'])

    # Map the user-facing kernel name to the LIBSVM constant
    kernel_types = {'linear':libsvm.LINEAR,'polynomial':libsvm.POLY,
                    'rbf':libsvm.RBF,'sigmoid':libsvm.SIGMOID}
    if self.kernel not in kernel_types:
        raise ValueError('Invalid kernel: '+self.kernel+'. Should be either \'linear\', \'polynomial\', \'rbf\' or \'sigmoid\'')

    # Class weights: none by default, otherwise 1 for every class except
    # the ones explicitly listed in self.label_weights.
    if self.label_weights is None:
        nr_weight = 0
        weight_label = []
        weight = []
    else:
        class_to_id = trainset.metadata['class_to_id']
        nr_weight = self.n_classes
        weight_label = list(range(self.n_classes))
        weight = [1]*self.n_classes
        for class_name, class_weight in self.label_weights.items():
            weight[class_to_id[class_name]] = class_weight

    libsvm_params = libsvm.svm_parameter(svm_type = libsvm.C_SVC,
                                         kernel_type = kernel_types[self.kernel],
                                         degree=self.degree,
                                         gamma=self.gamma,
                                         coef0=self.coef0,
                                         C=self.C,
                                         probability=int(self.output_probabilities),
                                         cache_size=self.cache_size,
                                         eps=self.tolerance,
                                         shrinking=int(self.shrinking),
                                         nr_weight = nr_weight,
                                         weight_label = weight_label,
                                         weight = weight)

    # Put training set in the appropriate format:
    #  a sparse input (a tuple, zipped as input[1] -> input[0]) becomes a dictionary,
    #  anything else is assumed to be a sequence and is kept intact
    libsvm_inputs = []
    libsvm_targets = []
    for sample,target in trainset:
        if type(sample) is tuple:
            libsvm_inputs.append(dict(zip(sample[1],sample[0])))
        else:
            libsvm_inputs.append(sample)
        libsvm_targets.append(float(target))  # LIBSVM requires double-valued targets

    # Train SVM
    libsvm_problem = libsvm.svm_problem(libsvm_targets,libsvm_inputs)
    self.svm = libsvm.svm_model(libsvm_problem,libsvm_params)
def Select_Model(self, C_min=-10, C_steps=11, gamma_min=-15, gamma_steps=16):
    """Search for the (C, gamma) pair with the smallest CV error, then refit.

    A coarse pass with step 1 in both dimensions is followed by a single
    100x100 refinement (step 0.02) that starts one unit below log2 of the
    coarse optimum. The result is written to self.svm_params and
    self.model is rebuilt with it.
    """
    # Coarse grid over the caller-supplied ranges.
    coarse_C, coarse_gamma = self.__Search__(C_min, C_steps, gamma_min, gamma_steps, 1, 1)

    # Fine grid around the coarse optimum. Other window/step combinations
    # tried by the original author: (-5, 0.1) and (-0.5, 0.01).
    fine_C, fine_gamma = self.__Search__(np.log2(coarse_C) - 1, 100,
                                         np.log2(coarse_gamma) - 1, 100,
                                         0.02, 0.02)

    self.svm_params['C'] = fine_C
    self.svm_params['gamma'] = fine_gamma
    tuned_parameter = svm.svm_parameter(**self.svm_params)
    self.model = svm.svm_model(self.svm_problem, tuned_parameter)
def __setstate__(self,state):
    '''
    Restore the object from its pickled state (needed for pickling).

    Every entry of `state` is copied into the instance dictionary except
    'svm'. The libsvm model cannot simply be pickled, so that entry holds
    the model file contents as text and is rebuilt by writing it to a
    temporary file and loading it from there.
    '''
    for key,value in state.items():
        if key == 'svm':
            # mkstemp creates the file atomically (no name race as with
            # mktemp). The handle is closed before libsvm reads the file,
            # and the file is removed even if loading fails.
            fd,filename = tempfile.mkstemp()
            try:
                with os.fdopen(fd,'w') as handle:
                    handle.write(value)
                self.svm = svm.svm_model(filename)
            finally:
                os.remove(filename)
            continue

        self.__dict__[key] = value
def __setstate__(self, state):
    '''
    Restore the object from its pickled state (needed for pickling).

    Every entry of `state` is copied into the instance dictionary except
    'svm'. The libsvm model cannot simply be pickled, so that entry holds
    the model file contents as text and is rebuilt by writing it to a
    temporary file and loading it from there.
    '''
    for key, value in state.items():
        if key == 'svm':
            # mkstemp creates the file atomically (no name race as with
            # mktemp). The handle is closed before libsvm reads the file,
            # and the file is removed even if loading fails.
            fd, filename = tempfile.mkstemp()
            try:
                with os.fdopen(fd, 'w') as handle:
                    handle.write(value)
                self.svm = svm.svm_model(filename)
            finally:
                os.remove(filename)
            continue

        self.__dict__[key] = value
def readmodel(model):
    """Reads the model and parameters for the given model name.

    `model` must be one of MODELS, otherwise web.notfound is raised. The
    libsvm model is loaded from '<model>.model'. The companion file
    '<model>.params' supplies two lines: the first is evaluated and stored
    as `scales` on the model object, the second is evaluated as `simmeths`.

    Returns (model, simmeths)"""
    if model not in MODELS:
        raise web.notfound('No model %s. Choices are: %s' % (model, ', '.join(MODELS)))
    modelfname = model + '.model'
    paramsfname = model + '.params'
    from svm import svm_model
    t1 = time.time()
    # Keep the name and the loaded object in separate variables so the log
    # line below reports the model *name*, not the repr of the model object.
    loaded = svm_model(modelfname)
    with open(paramsfname) as f:
        # SECURITY NOTE: eval() executes the file contents; the .params
        # files must come from a trusted source.
        loaded.scales = eval(f.readline().strip())
        simmeths = eval(f.readline().strip())
    log('Loaded verification model for %s from %s with %d dims and simmeths %s in %0.3f secs' % (model, modelfname, len(loaded.scales), simmeths, time.time()-t1))
    return (loaded, simmeths)
def __init__(self, data_dir, svm_prefix, has_zero_insert):
    """Load a trained libSVM model and its range file.

    Both '<svm_prefix>.model' and '<svm_prefix>.range' are looked up in
    data_dir; IOError is raised if either file is missing. The range file
    is loaded into a Normalizer, and self.probability records whether the
    loaded model reports probability == 1.
    """
    super(LibSvmClassifier, self).__init__()
    self.data_dir = data_dir
    self.svm_prefix = svm_prefix
    self.has_zero_insert = has_zero_insert

    # --- model file ---
    model_path = join(data_dir, svm_prefix + '.model')
    if not os.path.isfile(model_path):
        raise IOError("libSVM model file '%s' not found!" % model_path)
    self.logger.info("Loading libSVM model file '%s'." % model_path)
    self.svm_model = svm_model(model_path)

    # --- range file ---
    range_file = join(data_dir, svm_prefix + '.range')
    if not isfile(range_file):
        raise IOError("libSVM range file '%s' not found!" % range_file)
    self.logger.info("Loading libSVM range file '%s'." % range_file)
    self.normalizer = Normalizer(range_file)

    self.probability = bool(self.svm_model.probability == 1)
def __init__(self, data_dir, svm_prefix, has_zero_insert):
    """Load a trained libSVM model and its range file.

    Both "<svm_prefix>.model" and "<svm_prefix>.range" are looked up in
    data_dir; IOError is raised if either file is missing. The range file
    is loaded into a Normalizer, and self.probability records whether the
    loaded model reports probability == 1.
    """
    super(LibSvmClassifier, self).__init__()
    self.data_dir = data_dir
    self.svm_prefix = svm_prefix
    self.has_zero_insert = has_zero_insert

    # --- model file ---
    model_path = join(data_dir, svm_prefix + ".model")
    if not os.path.isfile(model_path):
        raise IOError("libSVM model file '%s' not found!" % model_path)
    self.logger.info("Loading libSVM model file '%s'." % model_path)
    self.svm_model = svm_model(model_path)

    # --- range file ---
    range_file = join(data_dir, svm_prefix + ".range")
    if not isfile(range_file):
        raise IOError("libSVM range file '%s' not found!" % range_file)
    self.logger.info("Loading libSVM range file '%s'." % range_file)
    self.normalizer = Normalizer(range_file)

    self.probability = bool(self.svm_model.probability == 1)
def train(self, search=False, **kwargs):
    """ Fit the SVM to the dataset held in self.ds and hand the resulting
    model to self.svm. For RBF kernels (the default), an optional
    meta-parameter search can be performed.

    All keyword arguments are first passed to self.setParams.

    :key search: optional name of grid search class to use for RBF kernels: 'GridSearch' or 'GridSearchDOE'
    :key log2g: base 2 log of the RBF width parameter
    :key log2C: base 2 log of the slack parameter
    :key searchlog: filename into which to dump the search log
    :key others: ...are passed through to the grid search and/or libsvm """
    self.setParams(**kwargs)

    targets = self.ds['target'].flatten()
    inputs = self.ds['input'].tolist()
    # NOTE: the name `problem` is referenced inside the eval string below.
    problem = svm_problem(targets, inputs)

    if not search:
        # plain training with the current parameter set
        param = svm_parameter(**self.params)
        model = svm_model(problem, param)
        logging.info("Training completed with parameters:")
        logging.info(repr(param))
    else:
        # this is a bit of a hack: `search` is evaluated as the name of the
        # search class, so it must come from trusted code only.
        model = eval(
            search
            + "(problem, self.svmtarget, cmin=[0,-7],cmax=[25,1], cstep=[0.5,0.2],plotflag=self.plot,searchlog=self.searchlog,**self.params)"
        )

    self.svm.setModel(model)
def train(self, examples, parameters=None):
    """Train a class-weighted libsvm model on the given examples.

    examples -- iterable of example records; element [1] of each record is
        used as the label and element [2] as the feature sample.
    parameters -- optional dict of keyword arguments for svm.svm_parameter.
        A lower-case "c" key is accepted as an alias for "C". The dict is
        copied, so the caller's object is left untouched, and None is
        treated as "no extra parameters".

    The trained model is stored in self.model. Each class found in
    self.classes (label -> example count) is weighted by
    1 - count / total, so rarer classes get the larger weight.
    """
    self.isBinary = self.isBinaryProblem(examples)
    examples = self.filterTrainingSet(examples)
    ExampleUtils.writeExamples(examples, self.tempDir+"/train.dat")

    # Prepare parameters on a private copy: the signature advertises None as
    # the default, and renaming "c" must not leak into the caller's dict.
    parameters = dict(parameters) if parameters is not None else {}
    if "c" in parameters:
        assert "C" not in parameters, "both 'c' and 'C' were given"
        parameters["C"] = parameters.pop("c")

    # Per-class weights, listed in sorted label order.
    totalExamples = float(sum(self.classes.values()))
    weight_label = sorted(self.classes.keys())
    weight = [1.0 - self.classes[k] / totalExamples for k in weight_label]
    libSVMparam = svm.svm_parameter(nr_weight=len(self.classes),
                                    weight_label=weight_label,
                                    weight=weight,
                                    **parameters)

    # Single pass over the examples, in case they are a one-shot iterable.
    labels = []
    samples = []
    for example in examples:
        labels.append(example[1])
        samples.append(example[2])

    problem = svm.svm_problem(labels, samples)
    self.model = svm.svm_model(problem, libSVMparam)
def bench_svm(X, Y, T):
    """
    bench with swig-generated wrappers that come with libsvm

    X, Y and T must provide ``tolist()``. The time needed to build the
    problem, train a model (svm_type=0, kernel_type=0) and predict every
    row of T is appended to the module-level ``svm_results`` list.
    """
    import svm

    # All conversions to plain lists happen before the clock starts, so the
    # measurement covers libsvm work only. The previous version converted T
    # a second time inside the timed region and never used T1.
    X1 = X.tolist()
    Y1 = Y.tolist()
    T1 = T.tolist()

    gc.collect()

    # start time
    tstart = datetime.now()
    problem = svm.svm_problem(Y1, X1)
    param = svm.svm_parameter(svm_type=0, kernel_type=0)
    model = svm.svm_model(problem, param)
    for sample in T1:
        model.predict(sample)
    delta = (datetime.now() - tstart)
    # stop time
    svm_results.append(delta.seconds + delta.microseconds/mu_second)
def load_model(self, session, path):
    """Load a libsvm model from `path` and switch the object to predicting.

    `path` is UTF-8 encoded before it is handed to libsvm. Any failure
    while encoding or loading is reported as ConfigFileException(path).
    """
    try:
        self.model = svm.svm_model(path.encode('utf-8'))
        self.predicting = 1
    except Exception:
        # `Exception` rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are no longer turned into a configuration error.
        raise ConfigFileException(path)
from numpy import * # a two-class problem #labels = array([0., 1., 1., 2.]) labels = array([-1, 1, 1, -1]) samples = array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]]) # set the parameters of the SVM param = svm.svm_parameter(kernel_type=svm.LINEAR, C=10) param.kernel_type = svm.RBF # svm_problem is used to hold the training data for the problem prob = svm.svm_problem(labels, samples) # now construct the model model = svm.svm_model(prob, param) print "Number of classes:", model.get_nr_class() # predict one new sample with the model: #testdata = array([1., 0.]) testdata = array([[1., 0.], [1., 1.], [0., 0.], [0., 1.]]) for data in testdata: print "One Prediction: ", model.predict(data) print "Desicion Values of the Prediction: ", model.predict_values( data) #[(1,-1)] # print "Probability of the Prediction: ",model.predict_probability( data ) print "---------LIBLINEAR----------------" class1 = [ll.vector2sparse(samples[0]), ll.vector2sparse(samples[1])] class2 = [ll.vector2sparse(samples[2]), ll.vector2sparse(samples[3])]
def train_SVR_Linear(self, labels, vectors, verbose, C_range, callback=None):
    '''
    Private use only.

    Tune C for a linear-kernel SVM by hold-out validation. The data is
    shuffled (seeded with self.random_seed when that is set) and split
    according to self.validation_size: a float in (0, 1) is a fraction,
    an int smaller than len(labels) is an absolute count. One model is
    trained per value in C_range and scored by mean squared error on the
    validation part.

    Sets self.training_table, self.training_info ([C, mse] rows), self.C,
    self.error and self.svm (the model with the lowest mse; the first one
    wins on ties).

    @param verbose: if true, print the tuning table.
    @param callback: optional function called with the integer percentage
        completed after each C value.
    @raise NotImplementedError: if self.validation_size cannot be
        interpreted or leaves the validation set empty.
    '''
    # combine the labels and vectors into one set.
    data = [[labels[i], vectors[i]] for i in range(len(labels))]

    # shuffle the data
    rng = random.Random()
    if self.random_seed is not None:
        rng.seed(self.random_seed)
    rng.shuffle(data)

    # partition into validation and training
    if type(self.validation_size) == float and 0.0 < self.validation_size < 1.0:
        training_cutoff = int(len(data) * (1.0 - self.validation_size))
    elif type(self.validation_size) == int and self.validation_size < len(labels):
        training_cutoff = len(labels) - self.validation_size
    else:
        raise NotImplementedError(
            "Cannot determine validation set from %s" % self.validation_size)

    if verbose:
        print("Training Cutoff:" + " %s %s" % (len(labels), training_cutoff))

    training_data = data[:training_cutoff]
    validation_data = data[training_cutoff:]
    if not validation_data:
        # e.g. validation_size <= 0: averaging the error below would divide
        # by zero, so report the unusable setting instead.
        raise NotImplementedError(
            "Cannot determine validation set from %s" % self.validation_size)

    tmp_labels = [each[0] for each in training_data]
    tmp_vectors = [each[1] for each in training_data]
    prob = svm.svm_problem(tmp_labels, tmp_vectors)

    training_info = []
    training_svm = []
    training_table = Table()
    self.training_table = training_table
    total = len(validation_data)

    for i, C in enumerate(C_range):
        param = svm.svm_parameter(svm_type=self.svm_type,
                                  kernel_type=svm.LINEAR,
                                  C=C, p=self.epsilon, nu=self.nu)
        test_svm = svm.svm_model(prob, param)

        # mean squared error on the held-out data
        mse = 0.0
        for label, vector in validation_data:
            error = label - test_svm.predict(vector)
            mse += error * error
        mse = mse / total

        training_svm.append(test_svm)
        training_info.append([C, mse])
        training_table.setElement(i, 'C', C)
        training_table.setElement(i, 'mse', mse)

        if callback is not None:
            callback(int(100 * float(i + 1) / len(C_range)))

    if verbose:
        print("")
        print("------------------------------")
        print(" Tuning Information:")
        print(" C error")
        print("------------------------------")

    # keep the first entry with the strictly smallest error
    best = training_info[0]
    best_svm = training_svm[0]
    for each, candidate in zip(training_info, training_svm):
        if verbose:
            print(" %8.3e %0.8f" % (each[0], each[1]))
        if best[-1] > each[-1]:
            best = each
            best_svm = candidate

    if verbose:
        print("------------------------------")
        print("")
        print("------------------------------")
        print(" Best Tuning:")
        print(" C error")
        print("------------------------------")
        print(" %8.3e %0.8f" % (best[0], best[1]))
        print("------------------------------")
        print("")

    self.training_info = training_info
    self.C = best[0]
    self.error = best[1]
    self.svm = best_svm
return classifier #inject the funcitonality into the vigra.learning.RandomForest class setattr(vigra.learning.RandomForest,"dumpToH5G",dumpRF) setattr(vigra.learning.RandomForest,"reconstructFromH5G", types.MethodType(reconstructRF, vigra.learning.RandomForest)) if __name__ == '__main__': at = vigra.VigraArray.defaultAxistags(4) at.dropChannelAxis() import svm svmmod = svm.svm_model() testObjects = [ numpy.zeros((100,20,7),numpy.uint8), at,[at,numpy.zeros((100,20,7),numpy.uint8)], {"pups" : at}, [at, "test", 42, 42.0, {"42" : 42,"test" : ["test"]}], svmmod] for o in testObjects: f = h5py.File("/tmp/test.h5","w") g = f.create_group("/testg") g.dumpObject(o) o2 = g.reconstructObject() print print "################" print "Original:", o print "------" print "Result :", o2 print o2.__class__
def train_SVR_Linear(self,labels,vectors,verbose, C_range, callback=None):
    '''
    Private use only.

    Tune C for a linear-kernel SVM by hold-out validation. The data is
    shuffled (seeded with self.random_seed when that is set) and split
    according to self.validation_size: a float in (0, 1) is a fraction,
    an int smaller than len(labels) is an absolute count. One model is
    trained per value in C_range and scored by mean squared error on the
    validation part.

    Sets self.training_table, self.training_info ([C, mse] rows), self.C,
    self.error and self.svm (the model with the lowest mse; the first one
    wins on ties).

    @param verbose: if true, print the tuning table.
    @param callback: optional function called with the integer percentage
        completed after each C value.
    @raise NotImplementedError: if self.validation_size cannot be
        interpreted or leaves the validation set empty.
    '''
    # combine the labels and vectors into one set.
    data = [[labels[i],vectors[i]] for i in range(len(labels))]

    # shuffle the data
    rng = random.Random()
    if self.random_seed is not None:
        rng.seed(self.random_seed)
    rng.shuffle(data)

    # partition into validation and training
    if type(self.validation_size) == float and 0.0 < self.validation_size < 1.0:
        training_cutoff = int(len(data)*(1.0-self.validation_size))
    elif type(self.validation_size) == int and self.validation_size < len(labels):
        training_cutoff = len(labels)-self.validation_size
    else:
        raise NotImplementedError("Cannot determine validation set from %s"%self.validation_size)

    if verbose:
        print("Training Cutoff:" + " %s %s"%(len(labels),training_cutoff))

    training_data = data[:training_cutoff]
    validation_data = data[training_cutoff:]
    if not validation_data:
        # e.g. validation_size <= 0: averaging the error below would divide
        # by zero, so report the unusable setting instead.
        raise NotImplementedError("Cannot determine validation set from %s"%self.validation_size)

    tmp_labels = [each[0] for each in training_data]
    tmp_vectors = [each[1] for each in training_data]
    prob = svm.svm_problem(tmp_labels,tmp_vectors)

    training_info = []
    training_svm = []
    training_table = Table()
    self.training_table = training_table
    total = len(validation_data)

    for i,C in enumerate(C_range):
        param = svm.svm_parameter(svm_type=self.svm_type,kernel_type = svm.LINEAR, C = C, p=self.epsilon,nu=self.nu)
        test_svm = svm.svm_model(prob, param)

        # mean squared error on the held-out data
        mse = 0.0
        for label,vector in validation_data:
            error = label - test_svm.predict(vector)
            mse += error*error
        mse = mse/total

        training_svm.append(test_svm)
        training_info.append([C,mse])
        training_table.setElement(i,'C',C)
        training_table.setElement(i,'mse',mse)

        if callback is not None:
            callback(int(100*float(i+1)/len(C_range)))

    if verbose:
        print("")
        print("------------------------------")
        print(" Tuning Information:")
        print(" C error")
        print("------------------------------")

    # keep the first entry with the strictly smallest error
    best = training_info[0]
    best_svm = training_svm[0]
    for each,candidate in zip(training_info,training_svm):
        if verbose:
            print(" %8.3e %0.8f"%(each[0],each[1]))
        if best[-1] > each[-1]:
            best = each
            best_svm = candidate

    if verbose:
        print("------------------------------")
        print("")
        print("------------------------------")
        print(" Best Tuning:")
        print(" C error")
        print("------------------------------")
        print(" %8.3e %0.8f"%(best[0],best[1]))
        print("------------------------------")
        print("")

    self.training_info = training_info
    self.C = best[0]
    self.error = best[1]
    self.svm = best_svm
def _load_model_file(self, model_file_path):
    """Load the libsvm model stored at model_file_path into self.model.

    The path is asserted to exist; the assertion message is the path.
    """
    path_is_present = os.path.exists(model_file_path)
    assert path_is_present, model_file_path
    loaded_model = svm.svm_model(model_file_path)
    self.model = loaded_model
def loadModel(self, filename):
    """ Load the SVM model description stored in `filename` into self.model. """
    restored = svm_model(filename)
    self.model = restored
# Directory containing this file; put first on sys.path so that the
# imports below can resolve modules located next to it.
HOME_PATH = dirname(abspath(__file__))
sys.path.insert(0, HOME_PATH)

import redis
import simplejson
from svm import svm_model
from build_svm import url_re, seg


def _get_features():
    """Return the feature list stored under the redis key 'features'.

    The stored value is decoded as JSON (an empty list when the key is
    missing) and every entry is encoded to UTF-8, ignoring encoding errors.
    """
    db = redis.StrictRedis()
    ws = simplejson.loads(db.get('features') or '[]')
    return [w.encode('utf-8', 'ignore') for w in ws]


# Loaded once at import time: the feature list and the trained model.
words = _get_features()
snap_model = svm_model(HOME_PATH + '/snap.svm')


def predict(text):
    """Return True if the model assigns label 1 to `text`, else False."""
    x = _build_x(text)
    label = snap_model.predict(x)
    # the prediction is converted to int before being compared with 1
    label = int(label)
    if label == 1:
        return True
    return False


def _build_x(text):
    # NOTE(review): only the beginning of this function is visible here;
    # the part that fills and returns `features` lies outside this view.
    # strip URLs, segment the text, reverse the token order, encode to UTF-8
    text = url_re.sub('', text)
    w_list = seg.cut(text.strip())
    w_list.reverse()
    w_list = [w.encode('utf-8') for w in w_list]
    features = []
def train(self, features, labels):
    """Train a libsvm model and store it in self.model.

    features -- training samples; they are passed through
        self._cleanse_features before use.
    labels -- numpy array of labels (asserted); converted to a plain list
        for libsvm.
    """
    labels_are_array = isinstance(labels, np.ndarray)
    assert labels_are_array, "labels should be numpy array"

    cleansed = self._cleanse_features(features)
    training_problem = svm.svm_problem(labels.tolist(), cleansed)
    self.model = svm.svm_model(training_problem, self._svm_parameter)
# NOTE(review): this is a fragment from the middle of a script; `ls`, `f`
# and `sample_file` are defined before the visible part.

# Store the new feature table in redis (as JSON under the key 'features').
print '新的特征表存入redis...'
print 'len features', len(ls)
db = redis.StrictRedis()
db.set('features', simplejson.dumps(ls))
f.close()

# Load the training samples: a JSON triple unpacked as (fy, fx, fd).
# fy and fx are passed to svm_problem below; fd[i][0] is later used as an
# image URL for sample i.
f = open(sample_file, 'r')
j = f.read()
fy, fx, fd = simplejson.loads(j)
f.close()

# Train the new model.
print '训练新的model'
prob = svm_problem(fy, fx)
param = svm_parameter(kernel_type = LINEAR, C = 80)
## training the model
m = svm_model(prob, param)
m.save('snap.svm')

# Re-classify every training sample: samples predicted as 1 are collected
# in html_snap, all others in html_trash; super_count counts predictions
# that agree with the training label.
img = '<img src="%s"></img>'
super_count = 0
error_count = 0
html_snap = ''
html_trash = ''
for i, x in enumerate(fx):
    label = m.predict(x)
    if label == 1:
        html_snap += img % fd[i][0]
    else:
        html_trash += img % fd[i][0]
    if label == fy[i]:
        super_count += 1