def __init__(self, matrix=None, **kwargs):
    """Initialize PrecomputedSGKernel.

    Parameters
    ----------
    matrix : SGKernel or Kernel or ndarray
      Kernel matrix to be used.
    **kwargs
      Passed through to ``SGKernel.__init__``.

    Raises
    ------
    RuntimeError
      If the available shogun revision is older than 4455, which lacks
      a usable ``CustomKernel``.
    """
    # Convert the input to something shogun's CustomKernel accepts
    if isinstance(matrix, SGKernel):
        k = matrix._k  # take the internal shogun kernel directly
    elif isinstance(matrix, Kernel):
        k = matrix.as_raw_np()  # convert to a raw numpy array
    else:
        # Coerce anything else to ndarray -- otherwise SG would segfault ;-)
        k = np.array(matrix)

    SGKernel.__init__(self, **kwargs)

    if versions['shogun:rev'] >= 4455:
        self._k = sgk.CustomKernel(k)
    else:
        # Call-form raise (the old `raise E, msg` statement is Python-2-only,
        # while this file elsewhere already uses Py3-compatible syntax)
        raise RuntimeError(
            "Cannot create PrecomputedSGKernel using current version"
            " of shogun -- please upgrade")
def _run_auc():
    """Run AUC kernel."""
    # Phase 1: build the Gaussian subkernel on random real-valued features.
    subk_params = {
        'name': 'Gaussian',
        'data': dataop.get_rand(),
        'feature_class': 'simple',
        'feature_type': 'Real',
        'args': {'key': ('size', 'width'), 'val': (10, 1.7)}
    }
    subk_feats = featop.get_features(
        subk_params['feature_class'], subk_params['feature_type'],
        subk_params['data'])
    subk = kernel.GaussianKernel(*subk_params['args']['val'])
    subk.init(subk_feats['train'], subk_feats['test'])
    output = fileop.get_output(category.KERNEL, subk_params, 'subkernel0_')

    # Phase 2: run the AUC kernel on word features, with the subkernel attached.
    auc_params = {
        'name': 'AUC',
        'data': dataop.get_rand(numpy.ushort, num_feats=2,
                                max_train=dataop.NUM_VEC_TRAIN,
                                max_test=dataop.NUM_VEC_TEST),
        'feature_class': 'simple',
        'feature_type': 'Word',
        'accuracy': 1e-8,
        'args': {'key': ('size', 'subkernel'), 'val': (10, subk)}
    }
    auc_feats = featop.get_features(
        auc_params['feature_class'], auc_params['feature_type'],
        auc_params['data'])
    _compute(auc_feats, auc_params, output)
def _run_combined():
    """Run Combined kernel.

    Builds a CombinedKernel from three string subkernels over DNA data,
    collects per-subkernel outputs, then writes the combined train/test
    kernel matrices to file.
    """
    kern = kernel.CombinedKernel()
    feats = {'train': CombinedFeatures(), 'test': CombinedFeatures()}
    output = {}
    params = {
        'name': 'Combined',
        'accuracy': 1e-7
    }
    # One entry per subkernel; 'data' is filled in per iteration below.
    subkdata = [
        {
            'name': 'FixedDegreeString',
            'feature_class': 'string',
            'feature_type': 'Char',
            'args': {'key': ('size', 'degree'), 'val': (10, 3)}
        },
        {
            'name': 'PolyMatchString',
            'feature_class': 'string',
            'feature_type': 'Char',
            'args': {
                'key': ('size', 'degree', 'inhomogene'),
                'val': (10, 3, True)
            }
        },
        {
            'name': 'LocalAlignmentString',
            'feature_class': 'string',
            'feature_type': 'Char',
            'args': {'key': ('size',), 'val': (10,)}
        }
    ]
    # enumerate() replaces the manual `i=0 ... i+=1` counter, and getattr()
    # replaces eval(): same class lookup (e.g. kernel.FixedDegreeStringKernel)
    # without executing arbitrary strings.
    for i, sd in enumerate(subkdata):
        kfun = getattr(kernel, sd['name'] + 'Kernel')
        subk = kfun(*sd['args']['val'])
        sd['data'] = dataop.get_dna()
        subkfeats = featop.get_features(
            sd['feature_class'], sd['feature_type'], sd['data'])
        output.update(
            fileop.get_output(category.KERNEL, sd, 'subkernel' + str(i) + '_'))
        kern.append_kernel(subk)
        feats['train'].append_feature_obj(subkfeats['train'])
        feats['test'].append_feature_obj(subkfeats['test'])

    output.update(fileop.get_output(category.KERNEL, params))
    kern.init(feats['train'], feats['train'])
    output['kernel_matrix_train'] = kern.get_kernel_matrix()
    kern.init(feats['train'], feats['test'])
    output['kernel_matrix_test'] = kern.get_kernel_matrix()
    fileop.write(category.KERNEL, output)
def _run_feats_byte():
    """Run kernel with ByteFeatures."""
    # Linear kernel over random unsigned-byte data using the RAWBYTE alphabet.
    params = {
        'name': 'Linear',
        'accuracy': 1e-8,
        'feature_class': 'simple',
        'feature_type': 'Byte',
        'data': dataop.get_rand(dattype=numpy.ubyte),
        'normalizer': kernel.AvgDiagKernelNormalizer()
    }
    byte_feats = featop.get_features(
        params['feature_class'], params['feature_type'],
        params['data'], RAWBYTE)
    _compute(byte_feats, params)
def _run_feats_word():
    """Run kernel with WordFeatures."""
    maxval = 42  # upper bound for the random word values (train and test alike)
    params = {
        'name': 'Linear',
        'accuracy': 1e-8,
        'feature_class': 'simple',
        'feature_type': 'Word',
        'data': dataop.get_rand(dattype=numpy.ushort,
                                max_train=maxval, max_test=maxval),
        'normalizer': kernel.AvgDiagKernelNormalizer()
    }
    word_feats = featop.get_features(
        params['feature_class'], params['feature_type'], params['data'])
    _compute(word_feats, params)
def _run_custom():
    """Run Custom kernel."""
    params = {
        'name': 'Custom',
        'accuracy': 1e-7,
        'feature_class': 'simple',
        'feature_type': 'Real'
    }
    dim_square = 7
    rand_data = dataop.get_rand(dim_square=dim_square)
    feats = featop.get_features(
        params['feature_class'], params['feature_type'], rand_data)
    rand_data = rand_data['train']
    # Symmetrize the square training matrix.
    symdata = rand_data + rand_data.T

    # Row-major walk over the lower triangle (entries with y <= x),
    # flattened into a 1-d array.
    lowertriangle = numpy.array(
        [symdata[(x, y)]
         for x in xrange(symdata.shape[1])
         for y in xrange(symdata.shape[0]) if y <= x])

    # Exercise the three ways of loading a CustomKernel matrix.
    kern = kernel.CustomKernel()
    kern.set_triangle_kernel_matrix_from_triangle(lowertriangle)
    km_triangletriangle = kern.get_kernel_matrix()
    kern.set_triangle_kernel_matrix_from_full(symdata)
    km_fulltriangle = kern.get_kernel_matrix()
    kern.set_full_kernel_matrix_from_full(rand_data)
    km_fullfull = kern.get_kernel_matrix()

    output = {
        'kernel_matrix_triangletriangle': km_triangletriangle,
        'kernel_matrix_fulltriangle': km_fulltriangle,
        'kernel_matrix_fullfull': km_fullfull,
        'kernel_symdata': numpy.matrix(symdata),
        'kernel_data': numpy.matrix(rand_data),
        'kernel_dim_square': dim_square
    }
    output.update(fileop.get_output(category.KERNEL, params))
    fileop.write(category.KERNEL, output)
# shogun try: import shogun from shogun import (Kernel as sgKernel, Features as sgFeatures, Classifier as sgClassifier) except ImportError, exc: config.ExternalDepFailed('shogun', exc) else: if os.getenv('MDP_DISABLE_SHOGUN'): config.ExternalDepFailed('shogun', 'disabled') else: # From now on just support shogun >= 1.0 # Between 0.10 to 1.0 there are too many API changes... try: version = sgKernel.Version_get_version_release() except AttributeError: config.ExternalDepFailed( 'shogun', 'too old, upgrade to at least version 1.0') else: if not version.startswith('v1.'): config.ExternalDepFailed( 'shogun', 'too old, upgrade to at least version 1.0.') else: config.ExternalDepFound('shogun', version) # libsvm try: import svm as libsvm libsvm.libsvm except ImportError, exc:
def _run_feats_string():
    """Run kernel with StringFeatures."""
    # Shared setup: every string kernel below runs over the same DNA features.
    # `params` is mutated in place between _compute() calls, so the order of
    # the sections matters ('accuracy' etc. carry over until reassigned).
    params = {
        'accuracy': 1e-9,
        'data': dataop.get_dna(),
        'feature_class': 'string',
        'feature_type': 'Char',
    }
    str_feats = featop.get_features(
        params['feature_class'], params['feature_type'], params['data'])

    params.update({'name': 'FixedDegreeString',
                   'args': {'key': ('size', 'degree'), 'val': (10, 3)}})
    _compute(str_feats, params)

    params.update({'accuracy': 0,
                   'name': 'LocalAlignmentString',
                   'args': {'key': ('size', ), 'val': (10, )}})
    _compute(str_feats, params)

    params.update({'accuracy': 1e-10,
                   'name': 'PolyMatchString',
                   'args': {'key': ('size', 'degree', 'inhomogene'),
                            'val': (10, 3, True)}})
    _compute(str_feats, params)
    params['args']['val'] = (10, 3, False)
    _compute(str_feats, params)

    params.update({'accuracy': 1e-15,
                   'name': 'SimpleLocalityImprovedString',
                   'args': {'key': ('size', 'length', 'inner_degree',
                                    'outer_degree'),
                            'val': (10, 5, 7, 5)}})
    _compute(str_feats, params)

    # LocalityImprovedString is buggy and therefore skipped here.

    params.update({'name': 'WeightedDegreeString',
                   'accuracy': 1e-9,
                   'args': {'key': ('degree', ), 'val': (20, )}})
    _compute(str_feats, params)
    params['args'] = {'key': ('degree', ), 'val': (1, )}
    _compute(str_feats, params)

    params.update({'name': 'WeightedDegreePositionString',
                   'args': {'key': ('size', 'degree'), 'val': (10, 20)}})
    _compute(str_feats, params)
    params['args'] = {'key': ('size', 'degree'), 'val': (10, 1)}
    _compute(str_feats, params)

    params.update({'name': 'OligoString',
                   'args': {'key': ('size', 'k', 'width'),
                            'val': (10, 3, 1.2)}})
    _compute(str_feats, params)
    params['args'] = {'key': ('size', 'k', 'width'), 'val': (10, 4, 1.7)}
    _compute(str_feats, params)

    params.update({'name': 'LinearString',
                   'accuracy': 1e-8,
                   'normalizer': kernel.AvgDiagKernelNormalizer()})
    del params['args']
    _compute(str_feats, params)
def _run_feats_real():
    """Run kernel with RealFeatures."""
    params = {
        'data': dataop.get_rand(),
        'accuracy': 1e-8,
        'feature_class': 'simple',
        'feature_type': 'Real'
    }
    # Dense and sparse views of the same random real-valued data.
    dense_feats = featop.get_features(
        params['feature_class'], params['feature_type'], params['data'])
    sparse_feats = featop.get_features(
        params['feature_class'], params['feature_type'], params['data'],
        sparse=True)

    # `params` is mutated in place between _compute() calls, so section
    # order matters ('accuracy', 'normalizer' carry over until reassigned).
    params.update({'name': 'Gaussian',
                   'args': {'key': ('size', 'width'), 'val': (10, 1.3)}})
    _compute(dense_feats, params)

    params.update({'name': 'GaussianShift',
                   'args': {'key': ('size', 'width', 'max_shift',
                                    'shift_step'),
                            'val': (10, 1.3, 2, 1)}})
    _compute(dense_feats, params)

    params.update({'name': 'SparseGaussian',
                   'args': {'key': ('size', 'width'), 'val': (10, 1.7)}})
    _compute(sparse_feats, params)

    params.update({'accuracy': 0,
                   'name': 'Const',
                   'args': {'key': ('c', ), 'val': (23., )}})
    _compute(dense_feats, params)

    params.update({'name': 'Diag',
                   'args': {'key': ('size', 'diag'), 'val': (10, 23.)}})
    _compute(dense_feats, params)

    params.update({'accuracy': 1e-9,
                   'name': 'Sigmoid',
                   'args': {'key': ('size', 'gamma', 'coef0'),
                            'val': (10, 1.1, 1.3)}})
    _compute(dense_feats, params)
    params['args']['val'] = (10, 0.5, 0.7)
    _compute(dense_feats, params)

    params.update({'name': 'Chi2',
                   'args': {'key': ('size', 'width'), 'val': (10, 1.2)}})
    _compute(dense_feats, params)

    params.update({'accuracy': 1e-8,
                   'name': 'SparsePoly',
                   'args': {'key': ('size', 'degree', 'inhomogene'),
                            'val': (10, 3, True)}})
    _compute(sparse_feats, params)
    params['args']['val'] = (10, 3, False)
    _compute(sparse_feats, params)

    params.update({'name': 'Poly',
                   'normalizer': kernel.SqrtDiagKernelNormalizer(),
                   'args': {'key': ('size', 'degree', 'inhomogene'),
                            'val': (10, 3, True)}})
    _compute(dense_feats, params)
    params['args']['val'] = (10, 3, False)
    _compute(dense_feats, params)

    params['normalizer'] = kernel.AvgDiagKernelNormalizer()
    del params['args']
    params['name'] = 'Linear'
    _compute(dense_feats, params)
    params['name'] = 'SparseLinear'
    _compute(sparse_feats, params)
def set_configuration():
    """Probe optional external dependencies and record the results.

    For each dependency (parallel python, shogun, libsvm, joblib, sklearn)
    this registers either config.ExternalDepFound(name, version) or
    config.ExternalDepFailed(name, reason).  Each probe also honors an
    MDP_DISABLE_<NAME> environment variable to opt out explicitly.
    """
    # set python version
    config.ExternalDepFound('python',
                            '.'.join([str(x) for x in sys.version_info]))
    version = mdp.__version__
    if mdp.__revision__:
        version += ', ' + mdp.__revision__
    config.ExternalDepFound('mdp', version)

    # parallel python dependency
    try:
        import pp
        # set pp secret if not there already
        # (workaround for debian patch to pp that disables pp's default
        # password)
        pp_secret = os.getenv('MDP_PP_SECRET') or 'mdp-pp-support-password'
        # module 'user' has been deprecated since python 2.6 and deleted
        # completely as of python 3.0.
        # Basically pp can not work on python 3 at the moment.
        import user
        if not hasattr(user, 'pp_secret'):
            user.pp_secret = pp_secret
    except ImportError as exc:
        config.ExternalDepFailed('parallel_python', exc)
    else:
        if os.getenv('MDP_DISABLE_PARALLEL_PYTHON'):
            config.ExternalDepFailed('parallel_python', 'disabled')
        else:
            # even if we can import pp, starting the server may still fail
            # for example with:
            # OSError: [Errno 12] Cannot allocate memory
            try:
                server = pp.Server()
                server.destroy()
            except Exception as exc:
                # no idea what exception the pp server may raise
                # we need to catch all here...
                config.ExternalDepFailed('parallel_python', exc)
            else:
                if _pp_needs_monkeypatching():
                    if os.getenv('MDP_DISABLE_MONKEYPATCH_PP'):
                        config.ExternalDepFailed(
                            'parallel_python',
                            pp.version + ' broken on Debian')
                    else:
                        # patched pp gets a scratch dir for the monkeypatch
                        config.ExternalDepFound(
                            'parallel_python',
                            pp.version + '-monkey-patched')
                        config.pp_monkeypatch_dirname = tempfile.gettempdir()
                else:
                    config.ExternalDepFound('parallel_python', pp.version)

    # shogun
    try:
        import shogun
        from shogun import (Kernel as sgKernel,
                            Features as sgFeatures,
                            Classifier as sgClassifier)
    except ImportError as exc:
        config.ExternalDepFailed('shogun', exc)
    else:
        if os.getenv('MDP_DISABLE_SHOGUN'):
            config.ExternalDepFailed('shogun', 'disabled')
        else:
            # From now on just support shogun < 2.0
            # Between 0.10 to 1.0 or beyond there are too many API changes...
            try:
                # only shogun >= 1.0 exposes this accessor
                version = sgKernel.Version_get_version_release()
            except AttributeError:
                config.ExternalDepFailed('shogun',
                                         'only shogun v1 is supported')
            else:
                if not version.startswith('v1.'):
                    config.ExternalDepFailed('shogun',
                                             'only shogun v1 is supported')
                else:
                    config.ExternalDepFound('shogun', version)

    # libsvm
    try:
        import svm as libsvm
        # attribute probe: old bindings lack `libsvm.libsvm`
        libsvm.libsvm
    except ImportError as exc:
        config.ExternalDepFailed('libsvm', exc)
    except AttributeError as exc:
        config.ExternalDepFailed('libsvm',
                                 'libsvm version >= 2.91 required')
    else:
        if os.getenv('MDP_DISABLE_LIBSVM'):
            config.ExternalDepFailed('libsvm', 'disabled')
        else:
            config.ExternalDepFound('libsvm', libsvm.libsvm._name)

    # joblib
    try:
        import joblib
    except ImportError as exc:
        config.ExternalDepFailed('joblib', exc)
    else:
        version = joblib.__version__
        if os.getenv('MDP_DISABLE_JOBLIB'):
            config.ExternalDepFailed('joblib', 'disabled')
        elif _version_too_old(version, (0, 4, 3)):
            config.ExternalDepFailed('joblib',
                                     'version %s is too old' % version)
        else:
            config.ExternalDepFound('joblib', version)

    # sklearn
    try:
        try:
            import sklearn
        except ImportError:
            # fall back to the old scikits.learn package name
            import scikits.learn as sklearn
        version = sklearn.__version__
    except ImportError as exc:
        config.ExternalDepFailed('sklearn', exc)
    except AttributeError as exc:
        config.ExternalDepFailed('sklearn', exc)
    else:
        if os.getenv('MDP_DISABLE_SKLEARN'):
            config.ExternalDepFailed('sklearn', 'disabled')
        elif _version_too_old(version, (0, 6)):
            config.ExternalDepFailed('sklearn',
                                     'version %s is too old' % version)
        else:
            config.ExternalDepFound('sklearn', version)
def _as_raw_sg(kernel):
    """Converts directly to a Shogun kernel"""
    raw_matrix = kernel.as_raw_np()
    return sgk.CustomKernel(raw_matrix)
def train_attribute(attribute_id, C, split=0):
    """Train an SVM for one binary attribute and write predictions to disk.

    Parameters
    ----------
    attribute_id : int-like
        Index into the global ``attributenames`` list.
    C : float-like
        SVM regularization constant (used for both classes).
    split : int
        0 uses the predefined train/test class list files; split > 0 takes
        classnames[(split-1)*10 : split*10] as the test classes and the
        rest for training (cross-validation folds).

    Returns
    -------
    (prediction, probabilities, Ltst)
        Raw SVM outputs, Platt-scaled probabilities, and test labels.

    Side effects: writes kernel matrices under /scratch/chl/ and
    prediction/probability/label files under ./DAP/.
    """
    from shogun import Classifier, Features, Kernel, Distance
    attribute_id = int(attribute_id)
    print "# attribute ", attributenames[attribute_id]
    C = float(C)
    print "# C ", C

    # ---- select train/test classes --------------------------------------
    if split == 0:
        train_classes = loadstr(
            '/nfs3group/chlgrp/datasets/Animals_with_Attributes/trainclasses.txt'
        )
        test_classes = loadstr(
            '/nfs3group/chlgrp/datasets/Animals_with_Attributes/testclasses.txt'
        )
    else:
        classnames = loadstr(
            '/nfs3group/chlgrp/datasets/Animals_with_Attributes/classnames.txt'
        )
        startid = (split - 1) * 10
        stopid = split * 10
        test_classes = classnames[startid:stopid]
        train_classes = classnames[0:startid] + classnames[stopid:]
    Xtrn, Ltrn = create_data(train_classes, attribute_id)
    Xtst, Ltst = create_data(test_classes, attribute_id)

    if min(Ltrn) == max(Ltrn):  # only 1 class
        # Degenerate case: predict the constant prior.  The expression maps
        # Lprior in [-1, 1] linearly into [0.1, 0.9] as a probability.
        # (assumes labels are +/-1 -- TODO confirm against create_data)
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(
            len(Ltst))  # fallback
        return prediction, probabilities, Ltst

    #sg('loglevel', 'WARN')
    # ---- per-feature chi^2 kernel widths via the median heuristic --------
    # (distance matrix computed on every 50th column of the training data)
    widths = {}
    for feature in all_features:
        traindata = array(Xtrn[feature][:, ::50], float)  # used to be 5*offset
        trainfeat = Features.RealFeatures(traindata)
        DM = Distance.ChiSquareDistance(trainfeat,
                                        trainfeat).get_distance_matrix()
        widths[feature] = median(DM.flatten())
        del traindata, trainfeat, DM

    s = Classifier.LibSVM()
    #sg('new_svm', 'LIBSVM')
    #sg('use_mkl', False)  # we use fixed weights here
    #sg('clean_features', 'TRAIN')
    #sg('clean_features', 'TEST')

    # ---- split training data 90/10 for Platt scaling ---------------------
    Lplatt_trn = concatenate([Ltrn[i::10]
                              for i in range(9)])  # 90% for training
    Lplatt_val = Ltrn[9::10]  # remaining 10% for platt scaling
    feats_trn = Features.CombinedFeatures()
    feats_val = Features.CombinedFeatures()
    for feature in all_features:
        Xplatt_trn = concatenate([Xtrn[feature][:, i::10] for i in range(9)],
                                 axis=1)
        feats_trn.append_feature_obj(
            Features.RealFeatures(ascontiguousarray(Xplatt_trn)))
        #sg('add_features', 'TRAIN', Xplatt_trn)
        Xplatt_val = Xtrn[feature][:, 9::10]
        feats_val.append_feature_obj(
            Features.RealFeatures(ascontiguousarray(Xplatt_val)))
        #sg('add_features', 'TEST', Xplatt_val)
        # free memory as we go -- these blocks are large
        del Xplatt_trn, Xplatt_val, Xtrn[feature]

    labels_trn = Features.Labels(Lplatt_trn)
    #sg('set_labels', 'TRAIN', Lplatt_trn)

    # ---- build the combined chi^2 kernel (one subkernel per feature) -----
    kernel = Kernel.CombinedKernel()
    #sg('set_kernel', 'COMBINED', 5000)
    for featureset in all_features:
        kernel.append_kernel(Kernel.Chi2Kernel(5000,
                                               widths[featureset] / 5.))
        #sg('add_kernel', 1., 'CHI2', 'REAL', 10, widths[featureset]/5. )
    kernel.init(feats_trn, feats_trn)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-trn.kernel' %
             (split, C, attribute_id))
    del K

    # ---- train ------------------------------------------------------------
    s.set_max_train_time(600 * 60.)
    #sg('svm_max_train_time', 600*60.)  # one hour should be plenty
    s.set_C(C, C)
    #sg('c', C)
    s.set_kernel(kernel)
    s.set_labels(labels_trn)
    #sg('init_kernel', 'TRAIN')
    try:
        s.train()
        #sg('train_classifier')
    except (RuntimeWarning, RuntimeError
            ):  # can't train, e.g. all samples have the same labels
        # Same constant-prior fallback as above, but also persisted to disk.
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))
        savetxt('./DAP/cvfold%d_C%g_%02d.txt' % (split, C, attribute_id),
                prediction)
        savetxt('./DAP/cvfold%d_C%g_%02d.prob' % (split, C, attribute_id),
                probabilities)
        savetxt('./DAP/cvfold%d_C%g_%02d.labels' % (split, C, attribute_id),
                Ltst)
        return prediction, probabilities, Ltst

    bias = s.get_bias()
    alphas = s.get_alphas()
    #[bias, alphas]=sg('get_svm')
    #print bias,alphas

    # ---- Platt scaling on the held-out 10% --------------------------------
    kernel.init(feats_trn, feats_val)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-val.kernel' %
             (split, C, attribute_id))
    del K
    #sg('init_kernel', 'TEST')
    try:
        prediction = s.classify().get_labels()
        #prediction=sg('classify')
        platt_params = SigmoidTrain(prediction, Lplatt_val)
        probabilities = SigmoidPredict(prediction, platt_params)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.txt' % (split, C, attribute_id),
                prediction)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.prob' % (split, C, attribute_id),
                probabilities)
        savetxt(
            './DAP/cvfold%d_C%g_%02d-val.labels' % (split, C, attribute_id),
            Lplatt_val)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.platt' % (split, C, attribute_id),
                platt_params)
        #print '#train-perf ',attribute_id,C,mean((prediction*Lplatt_val)>0),mean(Lplatt_val>0)
        #print '#platt-perf ',attribute_id,C,mean((sign(probabilities-0.5)*Lplatt_val)>0),mean(Lplatt_val>0)
    except RuntimeError:
        # validation failed -- fall back to constant prior and identity-ish
        # Platt parameters so the test phase below can still run
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))
        print >> sys.stderr, "#Error during testing. Using constant platt scaling"
        platt_params = [1., 0.]

    # ----------------------------- now apply to test classes ------------------
    feats_tst = Features.CombinedFeatures()
    #sg('clean_features', 'TEST')
    for feature in all_features:
        feats_tst.append_feature_obj(
            Features.RealFeatures(ascontiguousarray(Xtst[feature])))
        del Xtst[feature]
    kernel.init(feats_trn, feats_tst)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-tst.kernel' %
             (split, C, attribute_id))
    del K
    #sg('init_kernel', 'TEST')
    prediction = s.classify().get_labels()
    #prediction=sg('classify')
    probabilities = SigmoidPredict(prediction, platt_params)
    savetxt('./DAP/cvfold%d_C%g_%02d.txt' % (split, C, attribute_id),
            prediction)
    savetxt('./DAP/cvfold%d_C%g_%02d.prob' % (split, C, attribute_id),
            probabilities)
    savetxt('./DAP/cvfold%d_C%g_%02d.labels' % (split, C, attribute_id),
            Ltst)
    #print '#test-perf ',attribute_id,C,mean((prediction*Ltst)>0),mean(Ltst>0)
    #print '#platt-perf ',attribute_id,C,mean((sign(probabilities-0.5)*Ltst)>0),mean(Ltst>0)
    return prediction, probabilities, Ltst