Example #1
def createFeatures(self, examples):
    """Converts numpy arrays or sequences into shogun features"""
    if self.kparam['name'] == 'gauss' or self.kparam['name'] == 'linear' or self.kparam['name'] == 'poly':
        examples = numpy.array(examples)
        feats = RealFeatures(examples)

    elif self.kparam['name'] == 'wd' or self.kparam['name'] == 'localalign' or self.kparam['name'] == 'localimprove':
        #examples = non_atcg_convert(examples, nuc_con)
        feats = StringCharFeatures(examples, DNA)
    elif self.kparam['name'] == 'spec' or self.kparam['name'] == 'cumspec':
        #examples = non_atcg_convert(examples, nuc_con)
        feats = StringCharFeatures(examples, DNA)

        wf = StringUlongFeatures(feats.get_alphabet())
        wf.obtain_from_char(feats, self.kparam['degree']-1, self.kparam['degree'], 0, self.kparam['name']=='cumspec')
        del feats

        # train_mode and preproc are assumed to be attributes of the
        # enclosing object (compare the standalone create_features() below,
        # which takes them as arguments)
        if self.train_mode:
            self.preproc = SortUlongString()
            self.preproc.init(wf)
        wf.add_preproc(self.preproc)
        ret = wf.apply_preproc()
        feats = wf

    else:
        print 'Unknown kernel %s' % self.kparam['name']
        raise ValueError

    return feats
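The matrixA/matrixB/matrixC defaults of the next snippet are module-level globals it does not define. The dense-features script near the end of this collection builds its A/B/C matrices like this, so the same definitions can serve as a hypothetical stand-in (it must precede the def, since default arguments bind at definition time):

from numpy import array, float64, int64, uint8
matrixA = array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
matrixB = array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
matrixC = array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8)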
def features_simple_modular(A=matrixA,B=matrixB,C=matrixC):

    a=RealFeatures(A)
    b=LongIntFeatures(B)
    c=ByteFeatures(C)
    
    # or 16bit wide ...
    #feat1 = f.ShortFeatures(N.zeros((10,5),N.short))
    #feat2 = f.WordFeatures(N.zeros((10,5),N.uint16))

    # print some statistics about a

    # get first feature vector and set it
    a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)

# get matrices
    a_out = a.get_feature_matrix()
    b_out = b.get_feature_matrix()
    c_out = c.get_feature_matrix()

    assert(all(a_out==A))

    assert(all(b_out==B))

    assert(all(c_out==C))
    return a_out,b_out,c_out,a,b,c
def prune_var_sub_mean ():
	print 'PruneVarSubMean'

	from shogun.Kernel import Chi2Kernel
	from shogun.Features import RealFeatures
	from shogun.PreProc import PruneVarSubMean

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	preproc=PruneVarSubMean()
	preproc.init(feats_train)
	feats_train.add_preproc(preproc)
	feats_train.apply_preproc()
	feats_test.add_preproc(preproc)
	feats_test.apply_preproc()

	width=1.4
	size_cache=10
	
	kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()

	return km_train,km_test,kernel
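Most of the remaining snippets assume module-level fm_train_real/fm_test_real matrices (aliased traindat/testdat), normally loaded from Shogun's example data files. A minimal random stand-in for experimentation, assuming 2-dimensional real features:

import numpy
numpy.random.seed(17)
fm_train_real = numpy.random.rand(2, 11)  # 2 features x 11 train vectors
fm_test_real = numpy.random.rand(2, 17)   # 2 features x 17 test vectors
traindat, testdat = fm_train_real, fm_test_real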
def norm_one ():
	print 'NormOne'

	from shogun.Kernel import Chi2Kernel
	from shogun.Features import RealFeatures
	from shogun.PreProc import NormOne

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	preproc=NormOne()
	preproc.init(feats_train)
	feats_train.add_preproc(preproc)
	feats_train.apply_preproc()
	feats_test.add_preproc(preproc)
	feats_test.apply_preproc()

	width=1.4
	size_cache=10
	
	kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()

	return km_train,km_test,kernel
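NormOne rescales every feature vector to unit Euclidean length; in plain numpy terms the preprocessor amounts to roughly the following (a sketch, not Shogun's implementation):

import numpy
X = numpy.random.rand(2, 5)                  # columns are feature vectors
X_normed = X / numpy.linalg.norm(X, axis=0)  # every column now has norm 1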
def features_dense_zero_copy_modular (in_data=data):
	feats = None
	from distutils.version import LooseVersion
	if LooseVersion(numpy.__version__) >= LooseVersion('1.5'):
		feats=numpy.array(in_data, dtype=float64, order='F')		

		a=RealFeatures()
		a.frombuffer(feats, False)

		b=numpy.array(a, copy=False)
		c=numpy.array(a, copy=True)

		d=RealFeatures()
		d.frombuffer(a, False)

		e=RealFeatures()
		e.frombuffer(a, True)

		a[:,0]=0
		print a[0:4]
		print b[0:4]
		print c[0:4]
		print d[0:4]
		print e[0:4]
	else:
		print "numpy version >= 1.5 is needed"

	return feats
def modelselection_grid_search_kernel():
	num_subsets=3
	num_vectors=20
	dim_vectors=3

	# create some (non-sense) data
	matrix=rand(dim_vectors, num_vectors)

	# create num_vectors feature vectors of dimension dim_vectors
	features=RealFeatures()
	features.set_feature_matrix(matrix)

	# create labels, two classes
	labels=BinaryLabels(num_vectors)
	for i in range(num_vectors):
		labels.set_label(i, 1 if i%2==0 else -1)

	# create svm
	classifier=LibSVM()

	# splitting strategy
	splitting_strategy=StratifiedCrossValidationSplitting(labels, num_subsets)

	# accuracy evaluation
	evaluation_criterion=ContingencyTableEvaluation(ACCURACY)

	# cross validation class for evaluation in model selection
	cross=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterion)
	cross.set_num_runs(1)

	# print all parameters available for model selection
	# Don't worry if yours is not included; simply write to the mailing list
	classifier.print_modsel_params()

	# model parameter selection
	param_tree=create_param_tree()
	param_tree.print_tree()

	grid_search=GridSearchModelSelection(param_tree, cross)

	print_state=True
	best_combination=grid_search.select_model(print_state)
	print("best parameter(s):")
	best_combination.print_tree()

	best_combination.apply_to_machine(classifier)

	# larger number of runs to have tighter confidence intervals
	cross.set_num_runs(10)
	cross.set_conf_int_alpha(0.01)
	result=cross.evaluate()
	print("result: ")
	result.print_result()

	return 0
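create_param_tree() is called above but not shown. A plausible stand-in, modeled on the C1/C2 parameter tree that modelselection_grid_search_linear_modular builds further below (hypothetical; the original tree may have selected kernel parameters instead):

def create_param_tree():
	from shogun.ModelSelection import ModelSelectionParameters, R_EXP
	root = ModelSelectionParameters()
	c1 = ModelSelectionParameters("C1")
	root.append_child(c1)
	c1.build_values(-2.0, 2.0, R_EXP)
	c2 = ModelSelectionParameters("C2")
	root.append_child(c2)
	c2.build_values(-2.0, 2.0, R_EXP)
	return root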
def distance_mahalanobis_modular (fm_train_real = traindat, fm_test_real = testdat):

	from shogun.Features import RealFeatures
	from shogun.Distance import MahalanobisDistance

	feats_train = RealFeatures(fm_train_real)
	feats_test  = RealFeatures(fm_test_real)

	distance = MahalanobisDistance(feats_test, feats_train)
	for i in range(feats_test.get_num_vectors()):
		for j in range(feats_train.get_num_vectors()):
			dm = distance.distance(i, j)
			print dm
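The nested loop above fetches one distance at a time; the full test-vs-train matrix is also available in a single call, as the other distance examples do. A line that would sit at the end of the function above:

	# equivalent, vectorized form of the loop above
	dm_full = distance.get_distance_matrix()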
def distance_braycurtis_modular(fm_train_real=traindat, fm_test_real=testdat):

    from shogun.Features import RealFeatures
    from shogun.Distance import BrayCurtisDistance

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = BrayCurtisDistance(feats_train, feats_train)

    dm_train = distance.get_distance_matrix()
    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test
Example #9
def kernel_sigmoid_modular(fm_train_real=traindat,fm_test_real=testdat,size_cache=10,gamma=1.2,coef0=1.3):

	from shogun.Features import RealFeatures
	from shogun.Kernel import SigmoidKernel

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	

	kernel=SigmoidKernel(feats_train, feats_train, size_cache, gamma, coef0)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
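A quick numpy cross-check of a single entry, assuming the standard sigmoid-kernel form k(x,y) = tanh(gamma * x.y + coef0) and the random stand-in data defined earlier:

import numpy
km_train, km_test, kernel = kernel_sigmoid_modular()
x0, x1 = fm_train_real[:, 0], fm_train_real[:, 1]
expected = numpy.tanh(1.2*numpy.dot(x0, x1) + 1.3)  # defaults: gamma=1.2, coef0=1.3
assert numpy.isclose(km_train[0, 1], expected)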
def classifier_gaussiannaivebayes_modular(
        fm_train_real=traindat,
        fm_test_real=testdat,
        label_train_multiclass=label_traindat):
    from shogun.Features import RealFeatures, Labels
    from shogun.Classifier import GaussianNaiveBayes

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    labels = Labels(label_train_multiclass)

    gnb = GaussianNaiveBayes(feats_train, labels)
    gnb_train = gnb.train()
    output = gnb.apply(feats_test).get_labels()
    return gnb, gnb_train, output
def distance_chebyshew_modular(fm_train_real=traindat, fm_test_real=testdat):

    from shogun.Features import RealFeatures
    from shogun.Distance import ChebyshewMetric

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = ChebyshewMetric(feats_train, feats_train)

    dm_train = distance.get_distance_matrix()
    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test
Example #12
def classifier_multiclass_ecoc_random(fm_train_real=traindat,
                                      fm_test_real=testdat,
                                      label_train_multiclass=label_traindat,
                                      label_test_multiclass=label_testdat,
                                      lawidth=2.1,
                                      C=1,
                                      epsilon=1e-5):
    from shogun.Features import RealFeatures, MulticlassLabels
    from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine
    from shogun.Classifier import ECOCStrategy, ECOCRandomSparseEncoder, ECOCRandomDenseEncoder, ECOCHDDecoder

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    labels = MulticlassLabels(label_train_multiclass)

    classifier = LibLinear(L2R_L2LOSS_SVC)
    classifier.set_epsilon(epsilon)
    classifier.set_bias_enabled(True)

    rnd_dense_strategy = ECOCStrategy(ECOCRandomDenseEncoder(),
                                      ECOCHDDecoder())
    rnd_sparse_strategy = ECOCStrategy(ECOCRandomSparseEncoder(),
                                       ECOCHDDecoder())

    dense_classifier = LinearMulticlassMachine(rnd_dense_strategy, feats_train,
                                               classifier, labels)
    dense_classifier.train()
    label_dense = dense_classifier.apply(feats_test)
    out_dense = label_dense.get_labels()

    sparse_classifier = LinearMulticlassMachine(rnd_sparse_strategy,
                                                feats_train, classifier,
                                                labels)
    sparse_classifier.train()
    label_sparse = sparse_classifier.apply(feats_test)
    out_sparse = label_sparse.get_labels()

    if label_test_multiclass is not None:
        from shogun.Evaluation import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc_dense = evaluator.evaluate(label_dense, labels_test)
        acc_sparse = evaluator.evaluate(label_sparse, labels_test)
        print('Random Dense Accuracy  = %.4f' % acc_dense)
        print('Random Sparse Accuracy = %.4f' % acc_sparse)

    return out_sparse, out_dense
Example #13
def compute_output_plot_isolines(classifier, kernel=None, train=None, sparse=False, pos=None, neg=None, regression=False):
	size=100
	if pos is not None and neg is not None:
		x1_max=max(1.2*pos[0,:])
		x1_min=min(1.2*neg[0,:])
		x2_min=min(1.2*neg[1,:])
		x2_max=max(1.2*pos[1,:])
		x1=linspace(x1_min, x1_max, size)
		x2=linspace(x2_min, x2_max, size)
	else:
		x1=linspace(-5, 5, size)
		x2=linspace(-5, 5, size)

	x, y=meshgrid(x1, x2)

	dense=RealFeatures(array((ravel(x), ravel(y))))
	if sparse:
		test=SparseRealFeatures()
		test.obtain_from_simple(dense)
	else:
		test=dense

	if kernel and train:
		kernel.init(train, test)
	else:
		classifier.set_features(test)

	labels = None
	if regression:
		labels=classifier.apply().get_labels()
	else:
		labels=classifier.apply().get_values()
	z=labels.reshape((size, size))

	return x, y, z
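A self-contained usage sketch (assuming the shogun imports used throughout these examples): train a toy SVM on two Gaussian blobs, then plot iso-lines of its output grid.

from numpy import concatenate, ones, random
from pylab import pcolor, contour, colorbar, show
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import LibSVM

traindata = concatenate((random.randn(2, 20) - 1, random.randn(2, 20) + 1), axis=1)
feats_train = RealFeatures(traindata)
labels = Labels(concatenate((-ones(20), ones(20))))
kernel = GaussianKernel(feats_train, feats_train, 1.0)
svm = LibSVM(1.0, kernel, labels)
svm.train()

x, y, z = compute_output_plot_isolines(svm, kernel, feats_train)
pcolor(x, y, z)
contour(x, y, z, linewidths=1, colors='black')
colorbar()
show()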
Example #14
def kernel_wave_modular (fm_train_real=traindat,fm_test_real=testdat, theta=1.0):
	from shogun.Features import RealFeatures
	from shogun.Kernel import WaveKernel
	from shogun.Distance import EuclideanDistance

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	
	distance=EuclideanDistance(feats_train, feats_train)

	kernel=WaveKernel(feats_train, feats_train, theta, distance)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
Example #15
def statistics_kmm (n,d):
	from shogun.Features import RealFeatures
	from shogun.Features import DataGenerator
	from shogun.Kernel import GaussianKernel, MSG_DEBUG
	from shogun.Statistics import KernelMeanMatching
	from shogun.Mathematics import Math

	# init seed for reproducibility
	Math.init_random(1)
	random.seed(1)

	data = random.randn(d,n)

	# create shogun feature representation
	features=RealFeatures(data)

	# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
	# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
	# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
	kernel=GaussianKernel(10,8)
	kernel.init(features,features)

	kmm = KernelMeanMatching(kernel,array([0,1,2,3,7,8,9],dtype=int32),array([4,5,6],dtype=int32))
	w = kmm.compute_weights()
	#print w
	return w
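To make the width comment concrete (assuming Shogun's GaussianKernel computes k(x,y)=exp(-||x-y||^2/width)): passing width 8 reproduces a textbook Gaussian kernel with sigma=2.

import numpy
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel

X = numpy.array([[1.0, 0.5], [2.0, -1.0]])  # two 2-d points as columns
km = GaussianKernel(RealFeatures(X), RealFeatures(X), 8.0).get_kernel_matrix()
d2 = numpy.sum((X[:, 0] - X[:, 1])**2)
sigma = 2.0
assert numpy.isclose(km[0, 1], numpy.exp(-d2/(2*sigma**2)))  # tau = 2*sigma^2 = 8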
Example #16
def kernel_chi2_modular(fm_train_real=traindat,
                        fm_test_real=testdat,
                        width=1.4,
                        size_cache=10):
    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    kernel = Chi2Kernel(feats_train, feats_train, width, size_cache)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
Example #17
        def RunLinearRegressionShogun(q):
            totalTimer = Timer()

            # Load input dataset.
            # If the dataset contains two files then the second file is the responses
            # file.
            try:
                Log.Info("Loading dataset", self.verbose)
                if len(self.dataset) == 2:
                    X = np.genfromtxt(self.dataset[0], delimiter=',')
                    y = np.genfromtxt(self.dataset[1], delimiter=',')
                else:
                    X = np.genfromtxt(self.dataset, delimiter=',')
                    y = X[:, (X.shape[1] - 1)]
                    X = X[:, :-1]

                with totalTimer:
                    # Perform linear regression.
                    model = LeastSquaresRegression(RealFeatures(X.T),
                                                   RegressionLabels(y))
                    model.train()
                    b = model.get_w()
            except Exception as e:
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
Example #18
def so_multiclass(fm_train_real=traindat,
                  label_train_multiclass=label_traindat):
    try:
        from shogun.Features import RealFeatures
        from shogun.Loss import HingeLoss
        from shogun.Structure import MulticlassModel, MulticlassSOLabels, PrimalMosekSOSVM, RealNumber
    except ImportError:
        print("Mosek not available")
        return

    labels = MulticlassSOLabels(label_train_multiclass)
    features = RealFeatures(fm_train_real.T)

    model = MulticlassModel(features, labels)
    loss = HingeLoss()
    sosvm = PrimalMosekSOSVM(model, loss, labels)
    sosvm.train()

    out = sosvm.apply()
    count = 0
    for i in xrange(out.get_num_labels()):
        yi_pred = RealNumber.obtain_from_generic(out.get_label(i))
        if yi_pred.value == label_train_multiclass[i]:
            count = count + 1

    print("Correct classification rate: %0.2f" %
          (100.0 * count / out.get_num_labels()))
Example #19
def prepare_feats(desc, l=2, as_shogun=False):
    if l==2: desc = np.sqrt(desc) #bias not affected by sqrt

    norms = np.apply_along_axis(np.linalg.norm, 0, desc[:-1,:], l) #leave bias alone

    np.seterr(divide='ignore', invalid='ignore')

    desc[:-1,:]=desc[:-1,:]/norms #leave bias alone
    np.seterr(divide='warn', invalid='warn')

    if l==1: desc=desc[:-1,:] #removing bias dim if L1 -> nonlinear TODO find better way...

    desc[np.isnan(desc)]=0 #handle NaNs
    if as_shogun:
        desc=RealFeatures(desc.astype('float'))
    return desc
def preprocessor_kernelpca_modular(data, threshold, width):

    from shogun.Features import RealFeatures
    from shogun.Preprocessor import KernelPCA
    from shogun.Kernel import GaussianKernel
    features = RealFeatures(data)
    kernel = GaussianKernel(features, features, width)
    preprocessor = KernelPCA(kernel)
    preprocessor.init(features)
    preprocessor.set_target_dim(2)
    #X=preprocessor.get_transformation_matrix()
    X2 = preprocessor.apply_to_feature_matrix(features)
    lx0 = len(X2)
    modified_d1 = zeros((lx0, number_of_points_for_circle1))
    modified_d2 = zeros((lx0, number_of_points_for_circle2))
    modified_d1 = [X2[i][0:number_of_points_for_circle1] for i in range(lx0)]
    modified_d2 = [
        X2[i][number_of_points_for_circle1:(number_of_points_for_circle1 +
                                            number_of_points_for_circle2)]
        for i in range(lx0)
    ]
    p.plot(modified_d1[0][:], modified_d1[1][:], 'o', modified_d2[0][:],
           modified_d2[1][:], 'x')
    p.title('final data')
    p.show()
    return features
def distance_normsquared_modular (fm_train_real=traindat,fm_test_real=testdat):

	from shogun.Features import RealFeatures
	from shogun.Distance import EuclidianDistance

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	distance=EuclidianDistance(feats_train, feats_train)
	distance.set_disable_sqrt(True)

	dm_train=distance.get_distance_matrix()
	distance.init(feats_train, feats_test)
	dm_test=distance.get_distance_matrix()

	return distance,dm_train,dm_test
Example #22
def converter_multidimensionalscaling_modular(data):
    try:
        from shogun.Features import RealFeatures
        from shogun.Converter import MultidimensionalScaling
        from shogun.Distance import EuclideanDistance

        features = RealFeatures(data)

        distance_before = EuclideanDistance()
        distance_before.init(features, features)

        converter = MultidimensionalScaling()
        converter.set_target_dim(2)
        converter.set_landmark(False)
        embedding = converter.apply(features)

        distance_after = EuclideanDistance()
        distance_after.init(embedding, embedding)

        distance_matrix_after = distance_after.get_distance_matrix()
        distance_matrix_before = distance_before.get_distance_matrix()

        return numpy.linalg.norm(distance_matrix_after - distance_matrix_before
                                 ) / numpy.linalg.norm(distance_matrix_before)
    except ImportError:
        print('No Eigen3 available')
Example #23
    def log_pdf(self, thetas):
        assert (len(shape(thetas)) == 2)
        assert (shape(thetas)[1] == self.dimension)

        result = zeros(len(thetas))
        for i in range(len(thetas)):
            labels = BinaryLabels(self.y)
            feats_train = RealFeatures(self.X.T)

            # ARD: set theta, which is in log-scale, as kernel weights
            kernel = GaussianARDKernel(10, 1)
            kernel.set_weights(exp(thetas[i]))

            mean = ZeroMean()
            likelihood = LogitLikelihood()
            inference = LaplacianInferenceMethod(kernel, feats_train, mean,
                                                 labels, likelihood)

            # fix kernel scaling for now
            inference.set_scale(exp(0))

            if self.ridge is not None:
                log_ml_estimate = inference.get_marginal_likelihood_estimate(
                    self.n_importance, self.ridge)
            else:
                log_ml_estimate = inference.get_marginal_likelihood_estimate(
                    self.n_importance)

            # prior is also in log-domain, so no exp of theta
            log_prior = self.prior.log_pdf(thetas[i].reshape(
                1, len(thetas[i])))
            result[i] = log_ml_estimate + log_prior

        return result
Example #24
def kernel_tstudent_modular (fm_train_real=traindat,fm_test_real=testdat, degree=2.0):
	from shogun.Features import RealFeatures
	from shogun.Kernel import TStudentKernel
	from shogun.Distance import EuclidianDistance

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	
	distance=EuclidianDistance(feats_train, feats_train)

	kernel=TStudentKernel(feats_train, feats_train, degree, distance)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
def classifier_conjugateindex_modular(fm_train_real=traindat,
                                      fm_test_real=testdat,
                                      label_train_multiclass=label_traindat):
    from shogun.Features import RealFeatures, MulticlassLabels
    from shogun.Classifier import ConjugateIndex

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    labels = MulticlassLabels(label_train_multiclass)

    ci = ConjugateIndex(feats_train, labels)
    ci.train()

    res = ci.apply(feats_test).get_labels()
    return ci, res
def kernel_distance_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.7):
	from shogun.Kernel import DistanceKernel
	from shogun.Features import RealFeatures
	from shogun.Distance import EuclidianDistance
	
	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	
	distance=EuclidianDistance()

	kernel=DistanceKernel(feats_train, feats_test, width, distance)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
def classifier_multiclassmultipleoutputliblinear_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,width=2.1,C=1,epsilon=1e-5):
	from shogun.Features import RealFeatures, MulticlassLabels, MulticlassMultipleOutputLabels
	from shogun.Classifier import MulticlassLibLinear

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	labels=MulticlassLabels(label_train_multiclass)

	classifier = MulticlassLibLinear(C,feats_train,labels)
	classifier.train()

	label_pred = classifier.apply_multiclass_multiple_output(feats_test,2)
	out = label_pred.get_labels()
	#print out
	return out
Example #28
def kernel_multiquadric_modular (fm_train_real=traindat,fm_test_real=testdat, shift_coef=1.0):
	from shogun.Features import RealFeatures
	from shogun.Kernel import MultiquadricKernel
	from shogun.Distance import EuclidianDistance

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	
	distance=EuclidianDistance(feats_train, feats_train)

	kernel=MultiquadricKernel(feats_train, feats_train, shift_coef, distance)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
def modelselection_grid_search_linear_modular(traindat=traindat,
                                              label_traindat=label_traindat):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import ContingencyTableEvaluation, ACCURACY
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.ModelSelection import GridSearchModelSelection
    from shogun.ModelSelection import ModelSelectionParameters, R_EXP
    from shogun.ModelSelection import ParameterCombination
    from shogun.Features import Labels
    from shogun.Features import RealFeatures
    from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC

    # build parameter tree to select C1 and C2
    param_tree_root = ModelSelectionParameters()
    c1 = ModelSelectionParameters("C1")
    param_tree_root.append_child(c1)
    c1.build_values(-2.0, 2.0, R_EXP)

    c2 = ModelSelectionParameters("C2")
    param_tree_root.append_child(c2)
    c2.build_values(-2.0, 2.0, R_EXP)

    # training data
    features = RealFeatures(traindat)
    labels = Labels(label_traindat)

    # classifier
    classifier = LibLinear(L2R_L2LOSS_SVC)

    # print all parameters available for model selection
    # Don't worry if yours is not included; simply write to the mailing list
    classifier.print_modsel_params()

    # splitting strategy for cross-validation
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 10)

    # evaluation method
    evaluation_criterium = ContingencyTableEvaluation(ACCURACY)

    # cross-validation instance
    cross_validation = CrossValidation(classifier, features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)

    # model selection instance
    model_selection = GridSearchModelSelection(param_tree_root,
                                               cross_validation)

    # perform model selection with selected methods
    #print "performing model selection of"
    #param_tree_root.print_tree()
    best_parameters = model_selection.select_model()

    # print best parameters
    #print "best parameters:"
    #best_parameters.print_tree()

    # apply them and print result
    best_parameters.apply_to_machine(classifier)
    result = cross_validation.evaluate()
Example #30
        def RunLARSShogun(q):
            totalTimer = Timer()

            # Load input dataset.
            try:
                Log.Info("Loading dataset", self.verbose)
                inputData = np.genfromtxt(self.dataset[0], delimiter=',')
                responsesData = np.genfromtxt(self.dataset[1], delimiter=',')
                inputFeat = RealFeatures(inputData.T)
                responsesFeat = RegressionLabels(responsesData)

                # Get all the parameters.
                lambda1 = re.search("-l (\d+)", options)
                lambda1 = 0.0 if not lambda1 else int(lambda1.group(1))

                with totalTimer:
                    # Perform LARS.
                    model = LeastAngleRegression(False)
                    model.set_max_l1_norm(lambda1)
                    model.set_labels(responsesFeat)
                    model.train(inputFeat)
                    model.get_w(model.get_path_size() - 1)
            except Exception as e:
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
Example #31
def kernel_exponential_modular (fm_train_real=traindat,fm_test_real=testdat, tau_coef=1.0):
	from shogun.Features import RealFeatures
	from shogun.Kernel import ExponentialKernel
	from shogun.Distance import EuclidianDistance

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	distance = EuclidianDistance(feats_train, feats_train)
	
	kernel=ExponentialKernel(feats_train, feats_train, tau_coef, distance, 10)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
Example #32
            def RunKMeansShogun(q):
                import numpy as np
                from shogun.Distance import EuclideanDistance
                from shogun.Features import RealFeatures
                from shogun import Clustering
                from shogun.Mathematics import Math_init_random

                totalTimer = Timer()

                if seed:
                    Math_init_random(seed.group(1))
                try:
                    data = np.genfromtxt(self.dataset, delimiter=',')

                    dataFeat = RealFeatures(data.T)
                    distance = EuclideanDistance(dataFeat, dataFeat)

                    # Create the K-Means object and perform K-Means clustering.
                    with totalTimer:
                        model = Clustering.KMeans(int(clusters.group(1)),
                                                  distance)
                        model.set_max_iter(maxIterations)
                        model.train()

                        labels = model.apply().get_labels()
                        centers = model.get_cluster_centers()
                except Exception as e:
                    q.put(-1)
                    return -1

                time = totalTimer.ElapsedTime()
                q.put(time)
                return time
Example #33
        def RunGMMShogun(q):
            totalTimer = Timer()

            try:
                # Load input dataset.
                Log.Info("Loading dataset", self.verbose)
                dataPoints = np.genfromtxt(self.dataset, delimiter=',')
                dataFeat = RealFeatures(dataPoints.T)

                # Get all the parameters.
                g = re.search("-g (\d+)", options)
                n = re.search("-n (\d+)", options)
                s = re.search("-s (\d+)", options)

                g = 1 if not g else int(g.group(1))
                n = 250 if not n else int(n.group(1))

                # Create the Gaussian Mixture Model.
                model = Clustering.GMM(g)
                model.set_features(dataFeat)
                with totalTimer:
                    model.train_em(1e-9, n, 1e-9)
            except Exception as e:
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
Example #34
def kernel_cauchy_modular (fm_train_real=traindat,fm_test_real=testdat, sigma=1.0):
	from shogun.Features import RealFeatures
	from shogun.Kernel import CauchyKernel
	from shogun.Distance import EuclidianDistance

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	
	distance=EuclidianDistance(feats_train, feats_train)

	kernel=CauchyKernel(feats_train, feats_train, sigma, distance)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
def kernel_wavelet_modular(fm_train_real=traindat,
                           fm_test_real=testdat,
                           dilation=1.5,
                           translation=1.0):
    from shogun.Features import RealFeatures
    from shogun.Kernel import WaveletKernel

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    kernel = WaveletKernel(feats_train, feats_train, 10, dilation, translation)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
Example #36
def create_features(examples, param):
    """
    factory for features
    
    @param examples: list/array of examples
    @type examples: list

    @return subclass of shogun Features object
    @rtype: Features
    """

    assert (len(examples) > 0)

    feat = None

    #TODO: refactor
    if param and param.flags.has_key(
            "svm_type") and param.flags["svm_type"] == "liblineardual":
        # create hashed promoter features
        return create_hashed_promoter_features(examples, param.flags)

    if param and param.kernel == "Promoter":
        print "creating promoter features"
        # create promoter features
        return create_promoter_features(examples, param.flags)

    #auto_detect string type
    if type(examples[0]) == str:

        # check what alphabet is used
        longstr = ""
        num_seqs = min(len(examples), 20)
        for i in range(num_seqs):
            longstr += examples[i]

        if len(set([letter for letter in longstr])) > 5:
            feat = StringCharFeatures(PROTEIN)
            if param and param.flags.has_key("debug"):
                print "FEATURES: StringCharFeatures(PROTEIN)"
        else:
            feat = StringCharFeatures(DNA)
            if param and param.flags.has_key("debug"):
                print "FEATURES: StringCharFeatures(DNA)"

        feat.set_features(examples)

    else:

        # assume real features
        examples = numpy.array(examples, dtype=numpy.float64)

        examples = numpy.transpose(examples)

        feat = RealFeatures(examples)

        if param and param.flags.has_key("debug"):
            print "FEATURES: RealFeatures"

    return feat
def classifier_knn_modular(fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat, k=3 ):
	from shogun.Features import RealFeatures, Labels
	from shogun.Classifier import KNN
	from shogun.Distance import EuclidianDistance

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	distance=EuclidianDistance(feats_train, feats_train)


	labels=Labels(label_train_multiclass)

	knn=KNN(k, distance, labels)
	knn_train = knn.train()
	output=knn.apply(feats_test).get_labels()
	multiple_k=knn.classify_for_multiple_k()
	return knn,knn_train,output,multiple_k
Example #38
def bench_shogun(X, y, T, valid):
#
#       .. Shogun ..
#
    from shogun.Classifier import LibSVM
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
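    # note: 'sigma' is assumed to be a module-level kernel width supplied by
    # the surrounding benchmark harness (e.g. sigma = 1.0)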
    start = datetime.now()
    feat = RealFeatures(X.T)
    feat_test = RealFeatures(T.T)
    labels = Labels(y.astype(np.float64))
    kernel = GaussianKernel(feat, feat, sigma)
    shogun_svm = LibSVM(1., kernel, labels)
    shogun_svm.train()
    dec_func = shogun_svm.classify(feat_test).get_labels()
    score = np.mean(np.sign(dec_func) == valid)
    return score, datetime.now() - start
def features_dense_real_modular(A=matrix):

    # ... of type Real
    a = RealFeatures(A)

    # print(some statistics about a)
    # print(a.get_num_vectors())
    # print(a.get_num_features())

    # get first feature vector and set it
    # print(a.get_feature_vector(0))
    a.set_feature_vector(array([1, 4, 0, 0, 0, 9], dtype=float64), 0)

    # get matrix
    a_out = a.get_feature_matrix()

    assert all(a_out == A)
    return a_out
def features_director_dot_modular (fm_train_real, fm_test_real,
		label_train_twoclass, C, epsilon):

	from shogun.Features import RealFeatures, SparseRealFeatures, BinaryLabels
	from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC_DUAL
	from shogun.Mathematics import Math_init_random
	Math_init_random(17)

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	labels=BinaryLabels(label_train_twoclass)

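	# NumpyFeatures is not defined in this truncated snippet; the full
	# version of this example further below subclasses DirectorDotFeatures
	# to implement it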
	dfeats_train=NumpyFeatures(fm_train_real)
	dfeats_test=NumpyFeatures(fm_test_real)
	dlabels=BinaryLabels(label_train_twoclass)

	print feats_train.get_computed_dot_feature_matrix()
	print dfeats_train.get_computed_dot_feature_matrix()

	svm=LibLinear(C, feats_train, labels)
	svm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)
	svm.set_epsilon(epsilon)
	svm.set_bias_enabled(True)
	svm.train()

	svm.set_features(feats_test)
	svm.apply().get_labels()
	predictions = svm.apply()

	dfeats_train.__disown__()
	dfeats_train.parallel.set_num_threads(1)
	dsvm=LibLinear(C, dfeats_train, dlabels)
	dsvm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)
	dsvm.set_epsilon(epsilon)
	dsvm.set_bias_enabled(True)
	dsvm.train()

	dfeats_test.__disown__()
	dfeats_test.parallel.set_num_threads(1)
	dsvm.set_features(dfeats_test)
	dsvm.apply().get_labels()
	dpredictions = dsvm.apply()

	return predictions, svm, predictions.get_labels()
Example #41
def kernel_anova_modular (fm_train_real=traindat,fm_test_real=testdat,cardinality=2, size_cache=10):
	from shogun.Kernel import ANOVAKernel
	from shogun.Features import RealFeatures
	
	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	
	kernel=ANOVAKernel(feats_train, feats_train, cardinality, size_cache)
        
	for i in range(0,feats_train.get_num_vectors()):
		for j in range(0,feats_train.get_num_vectors()):
			k1 = kernel.compute_rec1(i,j)
			k2 = kernel.compute_rec2(i,j)
			#if abs(k1-k2) > 1e-10:
			#	print "|%s|%s|" % (k1, k2)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train, km_test, kernel
def preproc_prunevarsubmean_modular(fm_train_real=traindat, fm_test_real=testdat, width=1.4, size_cache=10):
    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures
    from shogun.PreProc import PruneVarSubMean

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    preproc = PruneVarSubMean()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    kernel = Chi2Kernel(feats_train, feats_train, width, size_cache)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def preprocessor_randomfouriergausspreproc_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
	from shogun.Kernel import Chi2Kernel
	from shogun.Features import RealFeatures
	from shogun.Preprocessor import RandomFourierGaussPreproc

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	preproc=RandomFourierGaussPreproc()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()

	kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()

	return km_train,km_test,kernel
def preprocessor_normone_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):

	from shogun.Kernel import Chi2Kernel
	from shogun.Features import RealFeatures
	from shogun.Preprocessor import NormOne

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	preprocessor=NormOne()
	preprocessor.init(feats_train)
	feats_train.add_preprocessor(preprocessor)
	feats_train.apply_preprocessor()
	feats_test.add_preprocessor(preprocessor)
	feats_test.apply_preprocessor()

	kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()

	return km_train,km_test,kernel
def preproc_logplusone_modular(fm_train_real=traindat, fm_test_real=testdat, width=1.4, size_cache=10):

    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures
    from shogun.PreProc import LogPlusOne

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    preproc = LogPlusOne()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    kernel = Chi2Kernel(feats_train, feats_train, width, size_cache)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
from shogun.Features import RealFeatures, LongIntFeatures, ByteFeatures
from numpy import array, float64, int64, uint8, all

# create dense matrices A,B,C
A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
B=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
C=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8)

# ... of type Real, LongInt and Byte
a=RealFeatures(A)
b=LongIntFeatures(B)
c=ByteFeatures(C)

# or 16bit wide ...
#feat1 = f.ShortFeatures(N.zeros((10,5),N.short))
#feat2 = f.WordFeatures(N.zeros((10,5),N.uint16))


# print some statistics about a
print a.get_num_vectors()
print a.get_num_features()

# get first feature vector and set it
print a.get_feature_vector(0)
a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)

# get matrices
a_out = a.get_feature_matrix()
b_out = b.get_feature_matrix()
c_out = c.get_feature_matrix()
def serialization_complex_example(num=5, dist=1, dim=10, C=2.0, width=10):
	import os
	from numpy import concatenate, zeros, ones
	from numpy.random import randn, seed
	from shogun.Features import RealFeatures, Labels
	from shogun.Classifier import GMNPSVM
	from shogun.Kernel import GaussianKernel
	from shogun.IO import SerializableHdf5File,SerializableAsciiFile, \
			SerializableJsonFile,SerializableXmlFile,MSG_DEBUG
	from shogun.Preprocessor import NormOne, LogPlusOne

	seed(17)

	data=concatenate((randn(dim, num), randn(dim, num) + dist,
					  randn(dim, num) + 2*dist,
					  randn(dim, num) + 3*dist), axis=1)
	lab=concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num)))

	feats=RealFeatures(data)
	#feats.io.set_loglevel(MSG_DEBUG)
	kernel=GaussianKernel(feats, feats, width)

	labels=Labels(lab)

	svm = GMNPSVM(C, kernel, labels)

	feats.add_preprocessor(NormOne())
	feats.add_preprocessor(LogPlusOne())
	feats.set_preprocessed(1)
	svm.train(feats)

	#svm.print_serializable()

	fstream = SerializableHdf5File("blaah.h5", "w")
	status = svm.save_serializable(fstream)
	check_status(status)

	fstream = SerializableAsciiFile("blaah.asc", "w")
	status = svm.save_serializable(fstream)
	check_status(status)

	fstream = SerializableJsonFile("blaah.json", "w")
	status = svm.save_serializable(fstream)
	check_status(status)

	fstream = SerializableXmlFile("blaah.xml", "w")
	status = svm.save_serializable(fstream)
	check_status(status)


	fstream = SerializableHdf5File("blaah.h5", "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status)
	new_svm.train()

	fstream = SerializableAsciiFile("blaah.asc", "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status)
	new_svm.train()

	fstream = SerializableJsonFile("blaah.json", "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status)
	new_svm.train()

	fstream = SerializableXmlFile("blaah.xml", "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status)
	new_svm.train()

	os.unlink("blaah.h5")
	os.unlink("blaah.asc")
	os.unlink("blaah.json")
	os.unlink("blaah.xml")
	return svm,new_svm
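check_status is used throughout the snippet above but never defined; a minimal stand-in:

def check_status(status):
	assert status, "(de)serialization failed"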
Example #48
def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features"""

    if kname == 'gauss' or kname == 'linear' or kname == 'poly':
        examples = numpy.array(examples)
        feats = RealFeatures(examples)
        
    elif kname == 'wd' or kname == 'localalign' or kname == 'localimprove':
        if seq_source == 'dna': 
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con) 
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)

    elif kname == 'spec' or kname == 'cumspec':
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA) 
        elif seq_source == 'protein':    
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)
       
        wf = StringUlongFeatures( feats.get_alphabet() )
        wf.obtain_from_char(feats, kparam['degree']-1, kparam['degree'], 0, kname=='cumspec')
        del feats

        if train_mode:
            preproc = SortUlongString()
            preproc.init(wf)
        wf.add_preproc(preproc)
        ret = wf.apply_preproc()
        #assert(ret)

        feats = wf
    elif kname == 'spec2' or kname == 'cumspec2':
        # spectrum kernel on two sequences
        feats = {}
        feats['combined'] = CombinedFeatures()

        reversed = kname=='cumspec2'

        (ex0,ex1) = zip(*examples)

        f0 = StringCharFeatures(list(ex0), DNA)
        wf = StringWordFeatures(f0.get_alphabet())
        wf.obtain_from_char(f0, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f0

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preproc(preproc)
        ret = wf.apply_preproc()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f0'] = wf

        f1 = StringCharFeatures(list(ex1), DNA)
        wf = StringWordFeatures( f1.get_alphabet() )
        wf.obtain_from_char(f1, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f1

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preproc(preproc)
        ret = wf.apply_preproc()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f1'] = wf

    else:
        print 'Unknown kernel %s' % kname
        raise ValueError
    
    return (feats,preproc)
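A minimal invocation sketch through the 'gauss' branch, which needs only numeric data (the DNA/protein branches additionally require the non_atcg_convert/non_aminoacid_converter helpers not shown here; RealFeatures is assumed imported at module level, as throughout these examples):

import numpy
examples = numpy.random.rand(2, 10)
feats, preproc = create_features('gauss', examples, {}, False, None, None, None)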
Example #49
# parameters, change to get different results
m=250
difference=3

# setting the angle lower makes a harder test
angle=pi/30

# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)

# create shogun feature representation
features_x=RealFeatures(array([data[0]]))
features_y=RealFeatures(array([data[1]]))

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
# Note that kernels per data can be different
kernel_x=GaussianKernel(10,8)
kernel_y=GaussianKernel(10,8)

# create hsic instance. Note that this is a convenience constructor which copies
# feature data; features_x and features_y are not the objects used inside hsic.
# This is only for user-friendliness. Usually, it's ok to do this.
# Below, the alternative distribution is sampled, which means
# that new feature objects have to be created in each iteration (slow).
# However, normally, the alternative distribution is not sampled.
Example #50
def statistics_hsic (n, difference, angle):
	from shogun.Features import RealFeatures
	from shogun.Features import DataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import HSIC
	from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Math, Statistics, IntVector
	
	# init seed for reproducibility
	Math.init_random(1)

	# note that HSIC has to store full kernel matrices,
	# which upper-bounds the usable sample size

	# use data generator class to produce example data
	data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
	#plot(data[0], data[1], 'x');show()

	# create shogun feature representation
	features_x=RealFeatures(array([data[0]]))
	features_y=RealFeatures(array([data[1]]))

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shogun's kernel width differs from the usual parametrization,
	# therefore use 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	subset=IntVector.randperm_vec(features_x.get_num_vectors())
	subset=subset[0:200]
	features_x.add_subset(subset)
	dist=EuclideanDistance(features_x, features_x)
	distances=dist.get_distance_matrix()
	features_x.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_x=median_distance**2
	features_y.add_subset(subset)
	dist=EuclideanDistance(features_y, features_y)
	distances=dist.get_distance_matrix()
	features_y.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_y=median_distance**2
	#print "median distance for Gaussian kernel on x:", sigma_x
	#print "median distance for Gaussian kernel on y:", sigma_y
	kernel_x=GaussianKernel(10,sigma_x)
	kernel_y=GaussianKernel(10,sigma_y)

	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05 using different methods to approximate
	# null-distribution
	statistic=hsic.compute_statistic()
	#print "HSIC:", statistic
	alpha=0.05

	#print "computing p-value using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	# normally, at least 250 iterations should be done, but that takes long
	hsic.set_bootstrap_iterations(100)
	# bootstrapping allows usage of unbiased or biased statistic
	p_value_boot=hsic.compute_p_value(statistic)
	thresh_boot=hsic.compute_threshold(alpha)
	#print "p_value:", p_value_boot
	#print "threshold for 0.05 alpha:", thresh_boot
	#print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_boot<alpha

	#print "computing p-value using gamma method"
	hsic.set_null_approximation_method(HSIC_GAMMA)
	p_value_gamma=hsic.compute_p_value(statistic)
	thresh_gamma=hsic.compute_threshold(alpha)
	#print "p_value:", p_value_gamma
	#print "threshold for 0.05 alpha:", thresh_gamma
	#print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_gamma<alpha

	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance strongly depends on data/kernel
	# bootstrapping, biased statistic
	#print "sampling null distribution using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	hsic.set_bootstrap_iterations(100)
	null_samples=hsic.bootstrap_null()
	#print "null mean:", mean(null_samples)
	#print "null variance:", var(null_samples)
	#hist(null_samples, 100); show()
	
	return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
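The median-distance heuristic used above, restated in plain numpy (a sketch: Shogun's Statistics.matrix_median is replaced by numpy.median over pairwise distances, assuming SciPy is available):

import numpy
from scipy.spatial.distance import pdist

data = numpy.random.randn(200, 2)  # 200 samples, 2 dimensions
median_distance = numpy.median(pdist(data))
tau = median_distance**2           # 0.5*2*median_distance^2, Shogun's kernel width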
def features_dense_protocols_modular(in_data=data):
	m_real=array(in_data, dtype=float64, order='F')
	f_real=RealFeatures(m_real)

	print m_real
	print f_real

	print f_real[-1]
	print f_real[1, 2]
	print f_real[-1:3]
	print f_real[2, 0:2]
	print f_real[0:3, 1]
	print f_real[0:3, 1:2]
	print f_real[:,1]
	print f_real[1,:]

	print m_real[-2]
	f_real[-1]=m_real[-2]
	print f_real[-1]

	print m_real[0, 1]
	f_real[1,2]=m_real[0,1]
	print f_real[1, 2]

	print m_real[0:2]
	f_real[1:3]=m_real[0:2]
	print f_real[1:3]

	print m_real[0, 0:2]
	f_real[2, 0:2]=m_real[0,0:2]
	print f_real[2, 0:2]

	print m_real[0:3, 2]
	f_real[0:3,1]=m_real[0:3, 2]
	print f_real[0:3, 1]

	print m_real[0:3, 0:1]
	f_real[0:3,1:2]=m_real[0:3,0:1]
	print f_real[0:3, 1:2]

	f_real[:,0]=0
	print f_real.get_feature_matrix()

	from distutils.version import LooseVersion
	if LooseVersion(numpy.__version__) >= LooseVersion('1.5'):
		f_real+=m_real
		f_real*=m_real
		f_real-=m_real
	else:
		print "numpy version >= 1.5 is needed"
		return None

	f_real+=f_real
	f_real*=f_real
	f_real-=f_real

	print f_real
	print f_real.get_feature_matrix()

	try:
		mem_real=memoryview(f_real)
	except NameError:
		print "Python2.7 is needed for memoryview class"
		return None

	ret_real=array(f_real)
	print ret_real

	return f_real[:,0]
Example #52
			np.dot(np.random.randn(N, dim), covs[1]) + np.array([-10, -10]),
			np.dot(np.random.randn(N, dim), covs[2]) + np.array([10, -10])];
	Y = np.hstack((np.zeros(N), np.ones(N), 2*np.ones(N)))
	return X, Y

# Number of classes
M = 3
# Number of samples of each class
N = 50
# Dimension of the data
dim = 2

X, y = gen_data()

labels = MulticlassSOLabels(y)
features = RealFeatures(X.T)

model = MulticlassModel(features, labels)
loss = HingeLoss()

risk = MulticlassRiskFunction()

risk_data = MulticlassRiskData(features, labels, model.get_dim(), features.get_num_vectors())

lambda_ = 1e3
sosvm = DualLibQPBMSOSVM(model, loss, labels, features, lambda_, risk, risk_data)

sosvm.set_cleanAfter(10)	# number of iterations that cutting plane has to be inactive for to be removed
sosvm.set_cleanICP(True)	# enables inactive cutting plane removal feature
sosvm.set_TolRel(0.001)		# set relative tolerance
sosvm.set_verbose(True)		# enables verbosity of the solver
# parameters, change to get different results
m=1000 # set to 10000 for a good test result
dim=2

# setting the difference of the first dimension smaller makes a harder test
difference=1

# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_mean_data(m,dim,difference)

# create shogun feature representation
features=RealFeatures(data)

# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shogun's kernel width differs from the usual parametrization,
# therefore use 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
# Using all distances here would blow up memory
subset=Math.randperm_vec(features.get_num_vectors())
subset=subset[0:200]
features.add_subset(subset)
dist=EuclideanDistance(features, features)
distances=dist.get_distance_matrix()
features.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma=median_distance**2
def features_director_dot_modular (fm_train_real, fm_test_real,
		label_train_twoclass, C, epsilon):
	try:
		from shogun.Features import DirectorDotFeatures
		from shogun.Library import RealVector
	except ImportError:
		print "recompile shogun with --enable-swig-directors"
		return

	class NumpyFeatures(DirectorDotFeatures):

		# variables
		data=numpy.empty((1,1))
		
		# constructor
		def __init__(self, d):
			DirectorDotFeatures.__init__(self)
			self.data = d
		
		# overloaded methods
		def add_to_dense_sgvec(self, alpha, vec_idx1, vec2, abs):
			if abs:
				vec2+=alpha*numpy.abs(self.data[:,vec_idx1])
			else:
				vec2+=alpha*self.data[:,vec_idx1]

		def dot(self, vec_idx1, df, vec_idx2):
			return numpy.dot(self.data[:,vec_idx1], df.get_computed_dot_feature_vector(vec_idx2))

		def dense_dot_sgvec(self, vec_idx1, vec2):
			return numpy.dot(self.data[:,vec_idx1], vec2[0:vec2.vlen])

		def get_num_vectors(self):
			return self.data.shape[1]

		def get_dim_feature_space(self):
			return self.data.shape[0]

		# operators
	#	def __add__(self, other):
	#		return NumpyFeatures(self.data+other.data)

	#	def __sub__(self, other):
	#		return NumpyFeatures(self.data-other.data)

	#	def __iadd__(self, other):
	#		return NumpyFeatures(self.data+other.data)

	#	def __isub__(self, other):
	#		return NumpyFeatures(self.data-other.data)


	from shogun.Features import RealFeatures, SparseRealFeatures, BinaryLabels
	from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC_DUAL
	from shogun.Mathematics import Math_init_random
	Math_init_random(17)

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	labels=BinaryLabels(label_train_twoclass)

	dfeats_train=NumpyFeatures(fm_train_real)
	dfeats_test=NumpyFeatures(fm_test_real)
	dlabels=BinaryLabels(label_train_twoclass)

	print feats_train.get_computed_dot_feature_matrix()
	print dfeats_train.get_computed_dot_feature_matrix()

	svm=LibLinear(C, feats_train, labels)
	svm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)
	svm.set_epsilon(epsilon)
	svm.set_bias_enabled(True)
	svm.train()

	svm.set_features(feats_test)
	svm.apply().get_labels()
	predictions = svm.apply()

	dfeats_train.__disown__()
	dfeats_train.parallel.set_num_threads(1)
	dsvm=LibLinear(C, dfeats_train, dlabels)
	dsvm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)
	dsvm.set_epsilon(epsilon)
	dsvm.set_bias_enabled(True)
	dsvm.train()

	dfeats_test.__disown__()
	dfeats_test.parallel.set_num_threads(1)
	dsvm.set_features(dfeats_test)
	dsvm.apply().get_labels()
	dpredictions = dsvm.apply()

	return predictions, svm, predictions.get_labels()
def statistics_linear_time_mmd ():
	from shogun.Features import RealFeatures
	from shogun.Features import DataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import LinearTimeMMD
	from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Statistics, Math

	# note that the linear time statistic is designed for much larger datasets
	n=10000
	dim=2
	difference=0.5

	# use data generator class to produce example data
	# in practice, this generate-data function could be replaced by a method
	# that obtains data from a stream
	data=DataGenerator.generate_mean_data(n,dim,difference)
	
	print "dimension means of X", mean(data.T[0:n].T)
	print "dimension means of Y", mean(data.T[n:2*n+1].T)

	# create shogun feature representation
	features=RealFeatures(data)

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shogun's kernel width differs from the usual parametrization,
	# therefore use 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	# Using all distances here would blow up memory
	subset=Math.randperm_vec(features.get_num_vectors())
	subset=subset[0:200]
	features.add_subset(subset)
	dist=EuclideanDistance(features, features)
	distances=dist.get_distance_matrix()
	features.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma=median_distance**2
	print "median distance for Gaussian kernel:", sigma
	kernel=GaussianKernel(10,sigma)

	mmd=LinearTimeMMD(kernel,features, n)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05
	statistic=mmd.compute_statistic()
	print "test statistic:", statistic
	
	# do the same thing using two different ways to approximate the null-distribution:
	# bootstrapping and Gaussian approximation (only for really large samples)
	alpha=0.05

	print "computing p-value using bootstrapping"
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	print "computing p-value using gaussian approximation"
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	# sample from the null distribution (these may be plotted, etc.)
	# mean should be close to zero; the variance strongly depends on data/kernel
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed
	null_samples=mmd.bootstrap_null()
	print "null mean:", mean(null_samples)
	print "null variance:", var(null_samples)
# parameters, change to get different results
m=100
dim=2

# setting the difference of the first dimension smaller makes a harder test
difference=0.5

# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_mean_data(m,dim,difference)

# create shogun feature representation
features=RealFeatures(data)

# use a kernel width of sigma=2, which is tau=8 in SHOGUN's parametrization
# k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)); hence tau=2*sigma^2=2*4=8
kernel=GaussianKernel(10,8)

# use biased statistic
mmd=QuadraticTimeMMD(kernel,features, m)
mmd.set_statistic_type(BIASED)

# sample alternative distribution
alt_samples=zeros(num_null_samples)
for i in range(len(alt_samples)):
	data=DataGenerator.generate_mean_data(m,dim,difference)
	features.set_feature_matrix(data)
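	# the source snippet breaks off here; mirroring the HSIC example below,
	# each iteration would presumably end by recording the statistic, e.g.
	# alt_samples[i]=mmd.compute_statistic()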
Example #57
def hsic_graphical():
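	# assumed imports from the surrounding script, based on what the body
	# below uses:
	# from numpy import *
	# from pylab import *
	# from shogun.Features import RealFeatures, DataGenerator
	# from shogun.Kernel import GaussianKernel
	# from shogun.Distance import EuclideanDistance
	# from shogun.Statistics import HSIC, BOOTSTRAP, HSIC_GAMMA
	# from shogun.Mathematics import Statistics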
	# parameters, change to get different results
	m=250
	difference=3
	
	# setting the angle lower makes a harder test
	angle=pi/30
	
	# number of samples taken from null and alternative distribution
	num_null_samples=500
	
	# use data generator class to produce example data
	data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)
	
	# create shogun feature representation
	features_x=RealFeatures(array([data[0]]))
	features_y=RealFeatures(array([data[1]]))
	
	# compute the median pairwise data distance to choose the Gaussian kernel
	# widths: the usual heuristic is 0.5*median_distance (the factor two sits
	# in the Gaussian), but SHOGUN parametrizes the kernel width differently,
	# so the corresponding width is 0.5*2*median_distance^2 = median_distance^2.
	# Use a subset of only 200 elements for this; the median is stable
	subset=int32(random.permutation(features_x.get_num_vectors())) # numpy permutation of 0..m-1
	subset=subset[0:200]
	features_x.add_subset(subset)
	dist=EuclideanDistance(features_x, features_x)
	distances=dist.get_distance_matrix()
	features_x.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_x=median_distance**2
	features_y.add_subset(subset)
	dist=EuclideanDistance(features_y, features_y)
	distances=dist.get_distance_matrix()
	features_y.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_y=median_distance**2
	print "median distance for Gaussian kernel on x:", sigma_x
	print "median distance for Gaussian kernel on y:", sigma_y
	kernel_x=GaussianKernel(10,sigma_x)
	kernel_y=GaussianKernel(10,sigma_y)
	
	# create the HSIC instance. Note that this is a convenience constructor
	# which copies the feature data, so features_x and features_y are not the
	# objects used inside hsic. This is only for user-friendliness and is
	# usually fine. Below, the alternative distribution is sampled, which means
	# that new feature objects have to be created in each iteration (slow).
	# Normally, however, the alternative distribution is not sampled.
	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)
	
	# sample alternative distribution
	alt_samples=zeros(num_null_samples)
	for i in range(len(alt_samples)):
		data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)
		features_x.set_feature_matrix(array([data[0]]))
		features_y.set_feature_matrix(array([data[1]]))
		
		# re-create the hsic instance every time, since the convenience
		# constructor copies the feature objects
		hsic=HSIC(kernel_x,kernel_y,features_x,features_y)
		alt_samples[i]=hsic.compute_statistic()
	
	# sample from null distribution
	# bootstrapping, biased statistic
	hsic.set_null_approximation_method(BOOTSTRAP)
	hsic.set_bootstrap_iterations(num_null_samples)
	null_samples_boot=hsic.bootstrap_null()
	
	# fit gamma distribution, biased statistic
	hsic.set_null_approximation_method(HSIC_GAMMA)
	gamma_params=hsic.fit_null_gamma()
	# draw null samples from a gamma distribution with the fitted
	# (shape, scale) parameters
	null_samples_gamma=gamma(gamma_params[0], gamma_params[1], num_null_samples)
	
	# plot
	figure()
	
	# plot data x and y
	subplot(2,2,1)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of y-ticks
	grid(True)
	plot(data[0], data[1], 'o')
	title('Data, rotation=$\pi$/'+str(int(round(pi/angle)))+'\nm='+str(m))
	xlabel('$x$')
	ylabel('$y$')
	
	# compute threshold for test level
	alpha=0.05
	null_samples_boot.sort()
	null_samples_gamma.sort()
	thresh_boot=null_samples_boot[int(floor(len(null_samples_boot)*(1-alpha)))]
	thresh_gamma=null_samples_gamma[int(floor(len(null_samples_gamma)*(1-alpha)))]
	
	# estimated type I errors: fraction of bootstrapped null samples that
	# exceed each threshold (roughly alpha by construction for thresh_boot;
	# for thresh_gamma it measures the quality of the gamma approximation)
	type_one_error_boot=sum(null_samples_boot>thresh_boot)/float(num_null_samples)
	type_one_error_gamma=sum(null_samples_boot>thresh_gamma)/float(num_null_samples)
	
	# plot alternative distribution with threshold
	subplot(2,2,2)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
	grid(True)
	hist(alt_samples, 20, normed=True)
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
	title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))
	
	# compute range for all null distribution histograms
	hist_range=[min([min(null_samples_boot), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_gamma)])]
	
	# plot null distribution with threshold
	subplot(2,2,3)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
	grid(True)
	hist(null_samples_boot, 20, range=hist_range, normed=True)
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	title('Bootstrapped Null Dist.\n' + 'Type I error is '  + str(type_one_error_boot))
	
	# plot null distribution gamma
	subplot(2,2,4)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
	grid(True)
	hist(null_samples_gamma, 20, range=hist_range, normed=True)
	axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
	title('Null Dist. Gamma\nType I error is '  + str(type_one_error_gamma))
	
	# pull plots a bit apart
	subplots_adjust(hspace=0.5)
	subplots_adjust(wspace=0.5)
Example #58
# parameters, change to get different results
m=250
difference=3

# setting the angle lower makes a harder test
angle=pi/30

# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)

# create shogun feature representation
features_x=RealFeatures(array([data[0]]))
features_y=RealFeatures(array([data[1]]))

# compute the median pairwise data distance to choose the Gaussian kernel
# width: the usual heuristic is 0.5*median_distance (the factor two sits
# in the Gaussian), but SHOGUN parametrizes the kernel width differently,
# so the corresponding width is 0.5*2*median_distance^2 = median_distance^2.
# Use a subset of only 200 elements for this; the median is stable
subset=Math.randperm_vec(features_x.get_num_vectors())
subset=subset[0:200]
features_x.add_subset(subset)
dist=EuclideanDistance(features_x, features_x)
distances=dist.get_distance_matrix()
features_x.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma_x=median_distance**2
def statistics_quadratic_time_mmd ():
	from shogun.Features import RealFeatures
	from shogun.Features import DataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import QuadraticTimeMMD
	from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Statistics, Math

	# note that the quadratic time mmd has to store the full kernel matrix,
	# which puts an upper bound on the usable sample size
	n=500
	dim=2
	difference=0.5

	# use data generator class to produce example data
	data=DataGenerator.generate_mean_data(n,dim,difference)
	
	print "dimension means of X", mean(data.T[0:n].T)
	print "dimension means of Y", mean(data.T[n:2*n+1].T)

	# create shogun feature representation
	features=RealFeatures(data)

	# compute the median pairwise data distance to choose the Gaussian kernel
	# width: the usual heuristic is 0.5*median_distance (the factor two sits
	# in the Gaussian), but SHOGUN parametrizes the kernel width differently,
	# so the corresponding width is 0.5*2*median_distance^2 = median_distance^2.
	# Use a subset of only 200 elements for this; the median is stable
	subset=Math.randperm_vec(features.get_num_vectors())
	subset=subset[0:200]
	features.add_subset(subset)
	dist=EuclideanDistance(features, features)
	distances=dist.get_distance_matrix()
	features.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma=median_distance**2
	print "median distance for Gaussian kernel:", sigma
	kernel=GaussianKernel(10,sigma)

	mmd=QuadraticTimeMMD(kernel,features, n)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05 using different methods to approximate
	# null-distribution
	statistic=mmd.compute_statistic()
	alpha=0.05
	
	print "computing p-value using bootstrapping"
	mmd.set_null_approximation_method(BOOTSTRAP)
	# normally, at least 250 iterations should be done, but that takes long
	mmd.set_bootstrap_iterations(10)
	# bootstrapping allows usage of unbiased or biased statistic
	mmd.set_statistic_type(UNBIASED)
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	# can only do this if SHOGUN was compiled with LAPACK, so check first
	if "sample_null_spectrum" in dir(QuadraticTimeMMD):
		print "computing p-value using spectrum method"
		mmd.set_null_approximation_method(MMD2_SPECTRUM)
		# normally, at least 250 null samples should be drawn, but that takes long
		mmd.set_num_samples_sepctrum(50) # sic: "sepctrum" is the method's spelling in this SHOGUN API
		mmd.set_num_eigenvalues_spectrum(n-10)
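		# presumably this approximates the null distribution using the largest
		# n-10=490 eigenvalues of the (2n)x(2n) kernel matrix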
		# spectrum method computes p-value for biased statistics only
		mmd.set_statistic_type(BIASED)
		p_value=mmd.compute_p_value(statistic)
		print "p_value:", p_value
		print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	print "computing p-value using gamma method"
	mmd.set_null_approximation_method(MMD2_GAMMA)
	# gamma method computes p-value for biased statistics only
	mmd.set_statistic_type(BIASED)
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	# sample from the null distribution (these may be plotted, etc.)
	# mean should be close to zero; the variance strongly depends on data/kernel
	# bootstrapping, biased statistic
	print "sampling null distribution using bootstrapping"
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_statistic_type(BIASED)
	mmd.set_bootstrap_iterations(10)
	null_samples=mmd.bootstrap_null()
	print "null mean:", mean(null_samples)
	print "null variance:", var(null_samples)
	
	# sample from the null distribution (these may be plotted, etc.)
	# mean should be close to zero; the variance strongly depends on data/kernel
	# spectrum, biased statistic
	print "sampling null distribution using spectrum method"
	mmd.set_null_approximation_method(MMD2_SPECTRUM)
	mmd.set_statistic_type(BIASED)
	# draw 50 null samples using 10 eigenvalues
	null_samples=mmd.sample_null_spectrum(50,10)
	print "null mean:", mean(null_samples)
	print "null variance:", var(null_samples)