def prepare_data(use_toy=True):
    """Load the UCI optdigits data set if available (and requested),
    otherwise fall back to the bundled toy data.

    Returns a list [traindat, label_traindat, testdat, label_testdat];
    in the toy-data branch no test labels exist, so label_testdat is None.
    """
    from os.path import exists
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    # NOTE(review): '../data/../mldata/...' resolves to '../mldata/...';
    # kept as-is so existing relative layouts keep working.
    data_file = '../data/../mldata/uci-20070111-optdigits.mat'
    if not use_toy and exists(data_file):
        from scipy.io import loadmat
        mat = loadmat(data_file)['int0'].astype(float)
        X = mat[:-1, :]   # all rows but the last are features
        Y = mat[-1, :]    # last row holds the labels
        # Floor division keeps the split index an int under Python 3
        # (plain '/' yields a float there and breaks the slicing below).
        isplit = X.shape[1] // 2
        traindat = X[:, :isplit]
        label_traindat = Y[:isplit]
        testdat = X[:, isplit:]
        label_testdat = Y[isplit:]
    else:
        traindat = lm.load_numbers('../data/fm_train_real.dat')
        testdat = lm.load_numbers('../data/fm_test_real.dat')
        label_traindat = lm.load_labels('../data/label_train_multiclass.dat')
        label_testdat = None

    return [traindat, label_traindat, testdat, label_testdat]
#!/usr/bin/env python from tools.load import LoadMatrix lm = LoadMatrix() train_dna = lm.load_dna("../data/fm_train_dna.dat") test_dna = lm.load_dna("../data/fm_test_dna.dat") label = lm.load_labels("../data/label_train_dna.dat") parameter_list = [[train_dna, test_dna, label, 20, 0.9, 1e-3, 1], [train_dna, test_dna, label, 20, 2.3, 1e-5, 4]] def classifier_svmlight_batch_linadd_modular( fm_train_dna, fm_test_dna, label_train_dna, degree, C, epsilon, num_threads ): from modshogun import StringCharFeatures, BinaryLabels, DNA from modshogun import WeightedDegreeStringKernel, MSG_DEBUG try: from modshogun import SVMLight except ImportError: print("No support for SVMLight available.") return feats_train = StringCharFeatures(DNA) # feats_train.io.set_loglevel(MSG_DEBUG) feats_train.set_features(fm_train_dna) feats_test = StringCharFeatures(DNA) feats_test.set_features(fm_test_dna) degree = 20
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random

lm = LoadMatrix()
random.seed(17)
# Ground truth vs. a deliberately wrong prediction (every label doubled).
ground_truth = lm.load_labels('../data/label_train_multiclass.dat')
predicted = lm.load_labels('../data/label_train_multiclass.dat') * 2

parameter_list = [[ground_truth, predicted]]


def evaluation_multiclassaccuracy_modular(ground_truth, predicted):
    """Return the multiclass accuracy of `predicted` against `ground_truth`."""
    from shogun.Features import MulticlassLabels
    from shogun.Evaluation import MulticlassAccuracy

    truth_wrapped = MulticlassLabels(ground_truth)
    pred_wrapped = MulticlassLabels(predicted)
    return MulticlassAccuracy().evaluate(pred_wrapped, truth_wrapped)


if __name__ == '__main__':
    print('MulticlassAccuracy')
    evaluation_multiclassaccuracy_modular(*parameter_list[0])
# (continuation of a libsvm() example; the function header, kernel and
# feature setup precede this excerpt)
    epsilon=1e-5
    # Train a LibSVM classifier on the prepared kernel and labels.
    labels=Labels(label_train_twoclass)
    svm=LibSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()
    #kernel.init(feats_train, feats_test)
    output = svm.classify(feats_test)#.get_labels()
    #output_vector = output.get_labels()
    # Predicted labels; testerr is the fraction of sign mismatches vs. testlab.
    out=svm.classify().get_labels()
    testerr=mean(sign(out)!=testlab)
    # Python-2 print statement (this is a legacy Python 2 example).
    print testerr
    #sv_idx=svm.get_support_vectors()
    #alphas=svm.get_alphas()
    #pm = PerformanceMeasures(output_vector, output)
    #acc = pm.get_accuracy()
    #roc = pm.get_auROC()
    #fms = pm.get_fmeasure()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    # NOTE(review): hard-coded absolute paths to a local shogun-0.9.3 checkout;
    # these will not exist on other machines — consider '../data/...' instead.
    fm_train_real=lm.load_numbers('/home/mati/lib/shogun-0.9.3/examples/documented/data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('/home/mati/lib/shogun-0.9.3/examples/documented/data/fm_test_real.dat')
    label_train_twoclass=lm.load_labels('/home/mati/lib/shogun-0.9.3/examples/documented/data/label_train_twoclass.dat')
    libsvm()
# (continuation of the svmocas() example; the function header precedes
# this excerpt)
    # Convert the dense real features into the sparse representation
    # that SVMOcas operates on, for both train and test sets.
    realfeat = RealFeatures(fm_train_real)
    feats_train = SparseRealFeatures()
    feats_train.obtain_from_simple(realfeat)
    realfeat = RealFeatures(fm_test_real)
    feats_test = SparseRealFeatures()
    feats_test.obtain_from_simple(realfeat)

    C = 0.9
    epsilon = 1e-5
    num_threads = 1
    labels = Labels(label_train_twoclass)

    # Linear SVM trained with the OCAS solver; bias term disabled.
    svm = SVMOcas(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(False)
    svm.train()

    # Swap in the test features and classify them.
    svm.set_features(feats_test)
    svm.classify().get_labels()

if __name__ == "__main__":
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers("../data/fm_train_real.dat")
    fm_test_real = lm.load_numbers("../data/fm_test_real.dat")
    label_train_twoclass = lm.load_labels("../data/label_train_twoclass.dat")
    svmocas()
# the precision parameter epsilon=1e-5. # # For more details on the SVM^light see # T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel # Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999. # # For more details on the Weighted Degree kernel see # G. Raetsch, S.Sonnenburg, and B. Schoelkopf. RASE: recognition of alternatively # spliced exons in C. elegans. Bioinformatics, 21:369-377, June 2005. from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') label_traindat = lm.load_labels('../data/label_train_dna.dat') parameter_list = [[traindat,testdat,label_traindat,1.1,1e-5,1],[traindat,testdat,label_traindat,1.2,1e-5,1]] def classifier_svmlight_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,C=1.2,epsilon=1e-5,num_threads=1): from shogun.Features import StringCharFeatures, Labels, DNA from shogun.Kernel import WeightedDegreeStringKernel try: from shogun.Classifier import SVMLight except ImportError: print 'No support for SVMLight available.' return feats_train=StringCharFeatures(DNA) feats_train.set_features(fm_train_dna) feats_test=StringCharFeatures(DNA)
from tools.load import LoadMatrix
from sg import sg

lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
train_label = lm.load_labels('../data/label_train_twoclass.dat')

# Each entry: train, test, labels, cache size, kernel width, C, epsilon, bias flag.
parameter_list = [[traindat, testdat, train_label, 10, 2.1, 1.2, 1e-5, False],
                  [traindat, testdat, train_label, 10, 2.1, 1.3, 1e-4, False]]


def classifier_libsvm(fm_train_real=traindat, fm_test_real=testdat,
                      label_train_twoclass=train_label, size_cache=10,
                      width=2.1, C=1.2, epsilon=1e-5, use_bias=False):
    """Train a Gaussian-kernel LibSVM through the static `sg` interface and
    classify the test set.

    Returns a tuple (predictions, test kernel matrix).
    """
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    sg('set_labels', 'TRAIN', label_train_twoclass)
    sg('new_classifier', 'LIBSVM')
    sg('svm_epsilon', epsilon)
    sg('c', C)
    sg('svm_use_bias', use_bias)
    sg('train_classifier')
    sg('set_features', 'TEST', fm_test_real)
    predictions = sg('classify')
    test_kernel = sg('get_kernel_matrix', 'TEST')
    return predictions, test_kernel


if __name__ == '__main__':
    print('LibSVM')
# This example shows how to compute the Hamming Word Distance for string features. from tools.load import LoadMatrix lm=LoadMatrix() traindna = lm.load_dna('../data/fm_train_dna.dat') testdna = lm.load_dna('../data/fm_test_dna.dat') testdat = lm.load_labels('../data/fm_test_real.dat') parameter_list = [[traindna,testdna,testdat,4,0,False,False], [traindna,testdna,testdat,3,0,False,False]] def distance_hammingword_modular (fm_train_dna=traindna,fm_test_dna=testdna, fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False): from shogun.Features import StringCharFeatures, StringWordFeatures, DNA from shogun.Preprocessor import SortWordString from shogun.Distance import HammingWordDistance charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet())
from tools.load import LoadMatrix
from sg import sg

lm = LoadMatrix()
traindat = lm.load_numbers("../data/fm_train_real.dat")
testdat = lm.load_numbers("../data/fm_test_real.dat")
train_label = lm.load_labels("../data/label_train_multiclass.dat")

# Each entry: train, test, labels, number of neighbours k.
parameter_list = [[traindat, testdat, train_label, 3],
                  [traindat, testdat, train_label, 4]]


def classifier_knn(fm_train_real=traindat, fm_test_real=testdat,
                   label_train_multiclass=train_label, k=3):
    """Train a k-nearest-neighbour classifier through the static `sg`
    interface and return its predictions on the test features."""
    sg("set_features", "TRAIN", fm_train_real)
    sg("set_labels", "TRAIN", label_train_multiclass)
    # "EUCLIDIAN" is the spelling this sg interface expects.
    sg("set_distance", "EUCLIDIAN", "REAL")
    sg("new_classifier", "KNN")
    sg("train_classifier", k)
    sg("set_features", "TEST", fm_test_real)
    predictions = sg("classify")
    return predictions


if __name__ == "__main__":
    print("KNN")
    classifier_knn(*parameter_list[0])
from tools.load import LoadMatrix
lm = LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')

# Each entry: train DNA, test DNA, labels, order, gap, reverse.
parameter_list = [[traindat, testdat, label_traindat, 3, 0, False],
                  [traindat, testdat, label_traindat, 3, 0, False]]

def kernel_histogram_word_string_modular(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, order=3, gap=0, reverse=False):
    # NOTE(review): this excerpt ends mid-function; kernel construction and
    # the PluginEstimate training follow beyond the visible text.
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
    from shogun.Kernel import HistogramWordStringKernel
    from shogun.Classifier import PluginEstimate #, MSG_DEBUG

    # No-op self-assignment kept from the original source.
    reverse = reverse

    # Convert the training DNA char strings into word features.
    charfeat = StringCharFeatures(DNA)
    #charfeat.io.set_loglevel(MSG_DEBUG)
    charfeat.set_features(fm_train_dna)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # Same conversion for the test strings.
    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test = StringWordFeatures(charfeat.get_alphabet())
#!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindna = lm.load_dna('../data/fm_train_dna.dat') testdna = lm.load_dna('../data/fm_test_dna.dat') testdat = lm.load_labels('../data/fm_test_real.dat') parameter_list = [[traindna,testdna,testdat,4,0,False,False], [traindna,testdna,testdat,3,0,False,False]] def distance_hammingword (fm_train_dna=traindna,fm_test_dna=testdna, fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False): from shogun import StringCharFeatures, StringWordFeatures, DNA from shogun import SortWordString from shogun import HammingWordDistance charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
# In this example PRC (Precision-Recall curve) is being computed
# for the pair of ground truth toy labels and random labels.
# PRC curve (as matrix) and auPRC (area under PRC) is returned.
from tools.load import LoadMatrix
from numpy import random

lm = LoadMatrix()
ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
# Random scores to evaluate against the true two-class labels.
predicted = random.randn(len(ground_truth))

parameter_list = [[ground_truth, predicted]]


def evaluation_prcevaluation_modular(ground_truth, predicted):
    """Evaluate `predicted` against `ground_truth` with a PRC evaluator.

    Returns a tuple (PRC curve matrix, area under the PRC curve).
    """
    from shogun.Features import BinaryLabels
    from shogun.Evaluation import PRCEvaluation

    truth = BinaryLabels(ground_truth)
    preds = BinaryLabels(predicted)
    prc = PRCEvaluation()
    prc.evaluate(preds, truth)
    return prc.get_PRC(), prc.get_auPRC()


if __name__ == '__main__':
    print('PRCEvaluation')
    evaluation_prcevaluation_modular(*parameter_list[0])
# In this example a multiclass accuracy is being computed for toy data labels
# and toy data labels multiplied by two.
from tools.load import LoadMatrix
from numpy import random

lm = LoadMatrix()
random.seed(17)
ground_truth = lm.load_labels('../data/label_train_multiclass.dat')
# Deliberately wrong predictions: every label doubled.
predicted = lm.load_labels('../data/label_train_multiclass.dat') * 2

parameter_list = [[ground_truth, predicted]]


def evaluation_multiclassaccuracy_modular(ground_truth, predicted):
    """Return the multiclass accuracy of `predicted` w.r.t. `ground_truth`."""
    from shogun.Features import Labels
    from shogun.Evaluation import MulticlassAccuracy

    ground_truth_labels = Labels(ground_truth)
    predicted_labels = Labels(predicted)
    evaluator = MulticlassAccuracy()
    accuracy = evaluator.evaluate(predicted_labels, ground_truth_labels)
    return accuracy


if __name__ == '__main__':
    # print() works on both Python 2 and 3 and matches the sibling examples;
    # the original Python-2-only print statement fails to parse on 3.x.
    print('MulticlassAccuracy')
    evaluation_multiclassaccuracy_modular(*parameter_list[0])
#!/usr/bin/env python """ Explicit examples on how to use the different classifiers """ from numpy import double, array, floor, concatenate, sign, ones, zeros, char, int from numpy.random import rand, seed, permutation from sg import sg from tools.load import LoadMatrix lm=LoadMatrix() fm_train_real=lm.load_numbers('../data/fm_train_real.dat') fm_test_real=lm.load_numbers('../data/fm_test_real.dat') fm_train_dna=lm.load_dna('../data/fm_train_dna.dat') fm_test_dna=lm.load_dna('../data/fm_test_dna.dat') label_train_dna=lm.load_labels('../data/label_train_dna.dat') label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat') label_train_multiclass=lm.load_labels('../data/label_train_multiclass.dat') ########################################################################### # kernel-based SVMs ########################################################################### def svm_light (): print 'SVMLight' size_cache=10 degree=20 C=0.017 epsilon=1e-5 use_bias=False
# The base kernels are then subsequently added to a CombinedKernel, which # contains a weight for each kernel and encapsulates the base kernels # from the training procedure. When the CombinedKernel between two examples is # evaluated it computes the corresponding linear combination of kernels according to their weights. # We then show how to create an MKLMultiClass classifier that trains an SVM and learns the optimal # weighting of kernels (w.r.t. a given norm q) at the same time. The main difference to the binary # classification version of MKL is that we can use more than two values as labels, when training # the classifier. # Finally, the example shows how to classify with a trained MKLMultiClass classifier. # from tools.load import LoadMatrix lm = LoadMatrix() fm_train_real = lm.load_numbers('../data/fm_train_real.dat') fm_test_real = lm.load_numbers('../data/fm_test_real.dat') label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat') parameter_list=[ [ fm_train_real, fm_test_real, label_train_multiclass, 1.2, 1.2, 1e-5, 1, 0.001, 1.5], [ fm_train_real, fm_test_real, label_train_multiclass, 5, 1.2, 1e-2, 1, 0.001, 2]] def mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass, width, C, epsilon, num_threads, mkl_epsilon, mkl_norm): from shogun.Features import CombinedFeatures, RealFeatures, MulticlassLabels from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel,PolyKernel from shogun.Classifier import MKLMulticlass kernel = CombinedKernel() feats_train = CombinedFeatures() feats_test = CombinedFeatures()
from tools.load import LoadMatrix
from numpy import random

lm=LoadMatrix()
ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
# Random scores to evaluate against the true two-class labels.
predicted = random.randn(len(ground_truth))

parameter_list = [[ground_truth,predicted]]

def evaluation_prcevaluation_modular(ground_truth, predicted):
    """Evaluate `predicted` against `ground_truth` with a PRC evaluator and
    return (PRC curve matrix, area under the PRC curve)."""
    from shogun.Features import Labels
    from shogun.Evaluation import PRCEvaluation

    ground_truth_labels = Labels(ground_truth)
    predicted_labels = Labels(predicted)
    evaluator = PRCEvaluation()
    evaluator.evaluate(predicted_labels,ground_truth_labels)
    return evaluator.get_PRC(), evaluator.get_auPRC()

if __name__=='__main__':
    # print() works on both Python 2 and 3 and matches the sibling examples;
    # the original Python-2-only print statement fails to parse on 3.x.
    print('PRCEvaluation')
    evaluation_prcevaluation_modular(*parameter_list[0])
from tools.load import LoadMatrix
lm = LoadMatrix()
traindna = lm.load_dna("../data/fm_train_dna.dat")
testdna = lm.load_dna("../data/fm_test_dna.dat")
testdat = lm.load_labels("../data/fm_test_real.dat")

# Each entry: train DNA, test DNA, real test data, order, gap, reverse, use_sign.
parameter_list = [[traindna, testdna, testdat, 4, 0, False, False],
                  [traindna, testdna, testdat, 3, 0, False, False]]

def distance_hammingword_modular(
    fm_train_dna=traindna, fm_test_dna=testdna, fm_test_real=testdat, order=3, gap=0, reverse=False, use_sign=False
):
    # NOTE(review): this excerpt ends mid-function; the distance computation
    # itself follows beyond the visible text. It also uses the older
    # add_preproc/apply_preproc API — other variants of this example use
    # add_preprocessor/apply_preprocessor; verify against the installed shogun.
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.Preprocessor import SortWordString
    from shogun.Distance import HammingWordDistance

    # Convert the training DNA char strings into word features and sort them.
    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    # Same conversion for the test strings.
    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test = StringWordFeatures(charfeat.get_alphabet())
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_multiclass.dat')

# Each entry: train, test, labels, C, num_threads, num_iter.
parameter_list = [[traindat,testdat,label_traindat,0.9,1,2000],[traindat,testdat,label_traindat,3,1,5000]]

def classifier_larank_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,C=0.9,num_threads=1,num_iter=5):
    # NOTE(review): this excerpt ends right after prediction; any return
    # statement lies beyond the visible text.
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LaRank
    from shogun.Mathematics import Math_init_random
    # Fixed RNG seed so the example produces reproducible results.
    Math_init_random(17)

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    width=2.1
    kernel=GaussianKernel(feats_train, feats_train, width)

    epsilon=1e-5
    labels=Labels(label_train_multiclass)

    # LaRank multiclass SVM; batch mode disabled.
    svm=LaRank(C, kernel, labels)
    #svm.set_tau(1e-3)
    svm.set_batch_mode(False)
    #svm.io.enable_progress()
    svm.set_epsilon(epsilon)
    svm.train()
    # Predictions on the training features.
    out=svm.apply(feats_train).get_labels()
# In this example a support vector regression algorithm is trained on a # real-valued toy data set. The underlying library used for the SVR training is # LIBSVM. The SVR is trained with regularization parameter C=1 and a gaussian # kernel with width=2.1. # # For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/ . from tools.load import LoadMatrix from sg import sg lm=LoadMatrix() traindat=lm.load_numbers('../data/fm_train_real.dat') testdat=lm.load_numbers('../data/fm_test_real.dat') trainlabel=lm.load_labels('../data/label_train_regression.dat') parameter_list=[[traindat,testdat,trainlabel,10,2.1,1.2,1e-5,1e-2], [traindat,testdat,trainlabel,11,2.3,1.3,1e-6,1e-3]] def regression_libsvr (fm_train=traindat,fm_test=testdat, label_train=trainlabel,size_cache=10,width=2.1, C=1.2,epsilon=1e-5,tube_epsilon=1e-2): sg('set_features', 'TRAIN', fm_train) sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width) sg('set_labels', 'TRAIN', label_train) sg('new_regression', 'LIBSVR') sg('svr_tube_epsilon', tube_epsilon) sg('c', C) sg('train_regression') sg('set_features', 'TEST', fm_test)
# In this example a two-class linear support vector machine classifier is trained # on a toy data set and the trained classifier is used to predict labels of test # examples. As training algorithm the Stochastic Gradient Descent (SGD) solver is # used with the SVM regularization parameter C=0.9. The number of iterations, i.e. # passes though all training examples, is set to num_iter=5 . # # For more details on the SGD solver see # L. Bottou, O. Bousquet. The tradeoff of large scale learning. In NIPS 20. MIT # Press. 2008. from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') label_traindat = lm.load_labels('../data/label_train_twoclass.dat') parameter_list = [[traindat,testdat,label_traindat,0.9,1,6],[traindat,testdat,label_traindat,0.8,1,5]] def classifier_svmsgd_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_twoclass=label_traindat,C=0.9,num_threads=1,num_iter=5): from shogun.Features import RealFeatures, SparseRealFeatures, Labels from shogun.Classifier import SVMSGD realfeat=RealFeatures(fm_train_real) feats_train=SparseRealFeatures() feats_train.obtain_from_simple(realfeat) realfeat=RealFeatures(fm_test_real) feats_test=SparseRealFeatures() feats_test.obtain_from_simple(realfeat)
from tools.load import LoadMatrix
from sg import sg
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
trainlabel = lm.load_labels('../data/label_train_regression.dat')

# Each entry: train, test, labels, cache size, width, C, epsilon, tube_epsilon.
parameter_list = [[traindat, testdat, trainlabel, 10, 2.1, 1.2, 1e-5, 1e-2],
                  [traindat, testdat, trainlabel, 11, 2.3, 1.3, 1e-6, 1e-3]]

def regression_libsvr(fm_train=traindat, fm_test=testdat,
                      label_train=trainlabel, size_cache=10, width=2.1,
                      C=1.2, epsilon=1e-5, tube_epsilon=1e-2):
    # NOTE(review): this excerpt ends right after classification; any return
    # statement lies beyond the visible text.
    sg('set_features', 'TRAIN', fm_train)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    sg('set_labels', 'TRAIN', label_train)
    sg('new_regression', 'LIBSVR')
    sg('svr_tube_epsilon', tube_epsilon)
    sg('c', C)
    sg('train_regression')
    sg('set_features', 'TEST', fm_test)
    result = sg('classify')