def transfer_multitask_l12_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
	from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup
	try:
		from modshogun import MultitaskL12LogisticRegression
	except ImportError:
		print("MultitaskL12LogisticRegression not available")
		exit(0)

	features = RealFeatures(hstack((traindat,traindat)))
	labels = BinaryLabels(hstack((label_train,label_train)))

	n_vectors = features.get_num_vectors()
	task_one = Task(0,n_vectors//2)
	task_two = Task(n_vectors//2,n_vectors)
	task_group = TaskGroup()
	task_group.append_task(task_one)
	task_group.append_task(task_two)

	mtlr = MultitaskL12LogisticRegression(0.1,0.1,features,labels,task_group)
	mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
	mtlr.set_max_iter(10)
	mtlr.train()
	mtlr.set_current_task(0)
	out = mtlr.apply_regression().get_labels()

	return out
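A minimal invocation sketch (assumptions: random matrices stand in for the traindat/label_traindat globals that the defaults above refer to; modshogun and numpy are installed):

from numpy import random, sign

traindat = random.rand(2, 100)                 # 2 features x 100 vectors
label_traindat = sign(random.rand(100) - 0.5)  # +/-1 binary labels
out = transfer_multitask_l12_logistic_regression(traindat, traindat, label_traindat)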
def features_dense_modular (A=matrixA,B=matrixB,C=matrixC):
    from modshogun import RealFeatures, LongIntFeatures, ByteFeatures
    from numpy import array, float64, all

    a=RealFeatures(A)
    b=LongIntFeatures(B)
    c=ByteFeatures(C)

# or 16bit wide ...
#feat1 = f.ShortFeatures(N.zeros((10,5),N.short))
#feat2 = f.WordFeatures(N.zeros((10,5),N.uint16))


# print some statistics about a

# get first feature vector and set it

    a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)

# get matrices
    a_out = a.get_feature_matrix()
    b_out = b.get_feature_matrix()
    c_out = c.get_feature_matrix()

    assert(all(a_out==A))

    assert(all(b_out==B))

    assert(all(c_out==C))
    return a_out,b_out,c_out,a,b,c
def features_dense_io_modular():
	from modshogun import RealFeatures, CSVFile
	feats=RealFeatures()
	f=CSVFile("../data/fm_train_real.dat","r")
	f.set_delimiter(" ")
	feats.load(f)
	return feats
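A minimal setup sketch for the loader above (assumptions: the hard-coded path is writable, and numpy.savetxt's default space delimiter matches the set_delimiter(" ") call):

import numpy
numpy.savetxt("../data/fm_train_real.dat", numpy.random.rand(5, 10))
feats = features_dense_io_modular()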
def transfer_multitask_leastsquares_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
	from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup
	try:
		from modshogun import MultitaskLeastSquaresRegression
	except ImportError:
		print("MultitaskLeastSquaresRegression not available")
		exit(0)

	features = RealFeatures(traindat)
	labels = RegressionLabels(label_train)

	n_vectors = features.get_num_vectors()
	task_one = Task(0,n_vectors//2)
	task_two = Task(n_vectors//2,n_vectors)
	task_group = TaskGroup()
	task_group.append_task(task_one)
	task_group.append_task(task_two)

	mtlsr = MultitaskLeastSquaresRegression(0.1,features,labels,task_group)
	mtlsr.set_regularization(1) # use regularization ratio
	mtlsr.set_tolerance(1e-2) # use 1e-2 tolerance
	mtlsr.train()
	mtlsr.set_current_task(0)
	out = mtlsr.apply_regression().get_labels()
	return out
def classifier_featureblock_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):

	from modshogun import BinaryLabels, RealFeatures, IndexBlock, IndexBlockGroup
	try:
		from modshogun import FeatureBlockLogisticRegression
	except ImportError:
		print("FeatureBlockLogisticRegression not available")
		exit(0)

	features = RealFeatures(hstack((traindat,traindat)))
	labels = BinaryLabels(hstack((label_train,label_train)))

	n_features = features.get_num_features()
	block_one = IndexBlock(0,n_features//2)
	block_two = IndexBlock(n_features//2,n_features)
	block_group = IndexBlockGroup()
	block_group.add_block(block_one)
	block_group.add_block(block_two)

	mtlr = FeatureBlockLogisticRegression(0.1,features,labels,block_group)
	mtlr.set_regularization(1) # use regularization ratio
	mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
	mtlr.train()
	out = mtlr.apply().get_labels()

	return out
def transfer_multitask_clustered_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
	from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG
	try:
		from modshogun import MultitaskClusteredLogisticRegression
	except ImportError:
		print("MultitaskClusteredLogisticRegression not available")
		exit()

	features = RealFeatures(hstack((traindat,sin(traindat),cos(traindat))))
	labels = BinaryLabels(hstack((label_train,label_train,label_train)))

	n_vectors = features.get_num_vectors()
	task_one = Task(0,n_vectors//3)
	task_two = Task(n_vectors//3,2*n_vectors//3)
	task_three = Task(2*n_vectors//3,n_vectors)
	task_group = TaskGroup()
	task_group.append_task(task_one)
	task_group.append_task(task_two)
	task_group.append_task(task_three)

	mtlr = MultitaskClusteredLogisticRegression(1.0,100.0,features,labels,task_group,2)
	#mtlr.io.set_loglevel(MSG_DEBUG)
	mtlr.set_tolerance(1e-3) # use 1e-3 tolerance
	mtlr.set_max_iter(100)
	mtlr.train()
	mtlr.set_current_task(0)
	#print mtlr.get_w()
	out = mtlr.apply_regression().get_labels()

	return out
def features_dense_zero_copy_modular (in_data=data):
	import numpy
	from numpy import float64
	from modshogun import RealFeatures

	feats = None
	# compare version components numerically; a plain string comparison
	# would misorder e.g. '1.10' and '1.5'
	if tuple(int(x) for x in numpy.__version__.split('.')[:2]) >= (1, 5):
		feats=numpy.array(in_data, dtype=float64, order='F')

		a=RealFeatures()
		a.frombuffer(feats, False)

		b=numpy.array(a, copy=False)
		c=numpy.array(a, copy=True)

		d=RealFeatures()
		d.frombuffer(a, False)

		e=RealFeatures()
		e.frombuffer(a, True)

		a[:,0]=0
		#print a[0:4]
		#print b[0:4]
		#print c[0:4]
		#print d[0:4]
		#print e[0:4]
	else:
		print("numpy version >= 1.5 is needed")

	return feats
def modelselection_grid_search_kernel (num_subsets, num_vectors, dim_vectors):
	# init seed for reproducibility
	Math.init_random(1)
	random.seed(1)

	# create some (nonsense) data
	matrix=random.rand(dim_vectors, num_vectors)

	# create num_vectors feature vectors of dimension dim_vectors
	features=RealFeatures()
	features.set_feature_matrix(matrix)

	# create labels, two classes
	labels=BinaryLabels(num_vectors)
	for i in range(num_vectors):
		labels.set_label(i, 1 if i%2==0 else -1)

	# create svm
	classifier=LibSVM()

	# splitting strategy
	splitting_strategy=StratifiedCrossValidationSplitting(labels, num_subsets)

	# accuracy evaluation
	evaluation_criterion=ContingencyTableEvaluation(ACCURACY)

	# cross validation class for evaluation in model selection
	cross=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterion)
	cross.set_num_runs(1)

	# print all parameters available for model selection
	# Don't worry if yours is not included, simply write to the mailing list
	#classifier.print_modsel_params()

	# model parameter selection
	param_tree=create_param_tree()
	#param_tree.print_tree()

	grid_search=GridSearchModelSelection(cross, param_tree)

	print_state=False
	best_combination=grid_search.select_model(print_state)
	#print("best parameter(s):")
	#best_combination.print_tree()

	best_combination.apply_to_machine(classifier)

	# larger number of runs to have tighter confidence intervals
	cross.set_num_runs(10)
	cross.set_conf_int_alpha(0.01)
	result=cross.evaluate()
	casted=CrossValidationResult.obtain_from_generic(result)
	#print("result mean:", casted.mean)

	return classifier,result,casted.mean
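The create_param_tree() helper used above is not shown; here is a minimal sketch, assuming the usual ModelSelectionParameters API (C1/C2 are LibSVM's regularization parameters, R_EXP builds candidate values on an exponential grid; the kernel parameter name, "log_width" here, varies across Shogun versions):

def create_param_tree():
	from modshogun import ModelSelectionParameters, R_EXP, GaussianKernel
	root=ModelSelectionParameters()

	c1=ModelSelectionParameters("C1")
	root.append_child(c1)
	c1.build_values(-1.0, 1.0, R_EXP)

	c2=ModelSelectionParameters("C2")
	root.append_child(c2)
	c2.build_values(-1.0, 1.0, R_EXP)

	# kernel subtree: a GaussianKernel whose width is also searched over
	gaussian_kernel=GaussianKernel()
	param_gaussian_kernel=ModelSelectionParameters("kernel", gaussian_kernel)
	gaussian_kernel_width=ModelSelectionParameters("log_width")
	gaussian_kernel_width.build_values(-1.0, 1.0, R_EXP)
	param_gaussian_kernel.append_child(gaussian_kernel_width)
	root.append_child(param_gaussian_kernel)

	return root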
def features_dense_real_modular (A=matrix):
    from modshogun import RealFeatures
    from numpy import array, float64, all

# create dense features of type Real
    a=RealFeatures(A)

# print some statistics about a
#print(a.get_num_vectors())
#print(a.get_num_features())

# get first feature vector and set it
#print(a.get_feature_vector(0))
    a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)

# get matrix
    a_out = a.get_feature_matrix()

    assert(all(a_out==A))
    return a_out
def multiclass_c45classifiertree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes):
	try:
		from modshogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
		from numpy import random, int32
	except ImportError:
		print("Could not import Shogun and/or numpy modules")
		return

	# wrap features and labels into Shogun objects
	feats_train=RealFeatures(CSVFile(train))
	feats_test=RealFeatures(CSVFile(test))
	train_labels=MulticlassLabels(CSVFile(labels))

	# divide train dataset into training and validation subsets in the ratio 2/3 to 1/3
	subset=int32(random.permutation(feats_train.get_num_vectors()))
	vsubset=subset[0:subset.size//3]
	trsubset=subset[subset.size//3:subset.size]

	# C4.5 Tree formation using training subset
	train_labels.add_subset(trsubset)
	feats_train.add_subset(trsubset)

	c=C45ClassifierTree()
	c.set_labels(train_labels)
	c.set_feature_types(ft)
	c.train(feats_train)

	train_labels.remove_subset()
	feats_train.remove_subset()

	# prune tree using validation subset
	train_labels.add_subset(vsubset)
	feats_train.add_subset(vsubset)

	c.prune_tree(feats_train,train_labels)

	train_labels.remove_subset()
	feats_train.remove_subset()

	# Classify test data
	output=c.apply_multiclass(feats_test).get_labels()
	output_certainty=c.get_certainty_vector()

	return c,output,output_certainty
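A minimal invocation sketch (assumptions: the CSV paths are placeholders for your own data files, and ft is a bool vector with False marking a continuous attribute, as C45ClassifierTree expects):

from numpy import array

ft = array([False, False])   # two continuous features
tree, preds, certainty = multiclass_c45classifiertree_modular('fm_train.csv', 'fm_test.csv', 'labels.csv', ft)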
def neuralnets_simple_modular (train_fname, test_fname,
		label_fname, C, epsilon):

	from modshogun import NeuralLayers, NeuralNetwork, RealFeatures, BinaryLabels
	from modshogun import Math_init_random, CSVFile
	Math_init_random(17)

	feats_train=RealFeatures(CSVFile(train_fname))
	feats_test=RealFeatures(CSVFile(test_fname))
	labels=BinaryLabels(CSVFile(label_fname))

	layers = NeuralLayers()
	network = NeuralNetwork(layers.input(feats_train.get_num_features()).linear(50).softmax(2).done())
	network.quick_connect()
	network.initialize_neural_network()

	network.set_labels(labels)
	network.train(feats_train)
	return network, network.apply_multiclass(feats_test)
def transfer_multitask_group_regression(fm_train=traindat,fm_test=testdat,label_train=label_traindat):

	from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup, MultitaskLSRegression

	features = RealFeatures(traindat)
	labels = RegressionLabels(label_train)

	n_vectors = features.get_num_vectors()
	task_one = Task(0,n_vectors//2)
	task_two = Task(n_vectors//2,n_vectors)
	task_group = TaskGroup()
	task_group.append_task(task_one)
	task_group.append_task(task_two)

	mtlsr = MultitaskLSRegression(0.1,features,labels,task_group)
	mtlsr.train()
	mtlsr.set_current_task(0)
	out = mtlsr.apply_regression().get_labels()
	return out
def load_data(num_train_samples=7291, m_data_dict=data_dict):
	from modshogun import RealFeatures, MulticlassLabels
	import numpy

	train_vec = m_data_dict['yTr'][0][:num_train_samples].astype(numpy.float64)
	train_labels = MulticlassLabels(train_vec)
	test_vec = m_data_dict['yTe'][0].astype(numpy.float64)
	test_labels = MulticlassLabels(test_vec)
	print("#train_labels = " + str(train_labels.get_num_labels()))
	print("#test_labels  = " + str(test_labels.get_num_labels()))

	train_mat = m_data_dict['xTr'][:,:num_train_samples].astype(numpy.float64)
	train_features = RealFeatures(train_mat)
	test_mat = m_data_dict['xTe'].astype(numpy.float64)
	test_features = RealFeatures(test_mat)
	print "#train_vectors = " + str(train_features.get_num_vectors())
	print "#test_vectors  = " + str(test_features.get_num_vectors())
	print "data dimension = " + str(test_features.get_num_features())

	return train_features, train_labels, test_features, test_labels
def preprocessor_randomfouriergausspreproc_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
	from modshogun import Chi2Kernel
	from modshogun import RealFeatures
	from modshogun import RandomFourierGaussPreproc

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	preproc=RandomFourierGaussPreproc()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()

	kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()

	return km_train,km_test,kernel
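A minimal invocation sketch (assumption: non-negative random matrices stand in for the traindat/testdat defaults, since Chi2Kernel expects non-negative inputs):

from numpy import random

km_train, km_test, kernel = preprocessor_randomfouriergausspreproc_modular(random.rand(5, 100), random.rand(5, 50))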
def transfer_multitask_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):

	from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup, MultitaskLogisticRegression

	features = RealFeatures(hstack((traindat,traindat)))
	labels = BinaryLabels(hstack((label_train,label_train)))

	n_vectors = features.get_num_vectors()
	task_one = Task(0,n_vectors//2)
	task_two = Task(n_vectors//2,n_vectors)
	task_group = TaskGroup()
	task_group.append_task(task_one)
	task_group.append_task(task_two)

	mtlr = MultitaskLogisticRegression(0.1,features,labels,task_group)
	mtlr.set_regularization(1) # use regularization ratio
	mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
	mtlr.train()
	mtlr.set_current_task(0)
	out = mtlr.apply().get_labels()

	return out
def preprocessor_prunevarsubmean_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
	from modshogun import Chi2Kernel
	from modshogun import RealFeatures
	from modshogun import PruneVarSubMean

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	preproc=PruneVarSubMean()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()

	kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()

	return km_train,km_test,kernel
def metric_lmnn_statistics(
    k=3,
    fname_features="../../data/fm_train_multiclass_digits.dat.gz",
    fname_labels="../../data/label_train_multiclass_digits.dat",
):
    try:
        from modshogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
        import matplotlib.pyplot as pyplot
    except ImportError:
        print "Error importing modshogun or other required modules. Please, verify their installation."
        return

    features = RealFeatures(load_compressed_features(fname_features).T)
    labels = MulticlassLabels(CSVFile(fname_labels))

    # 	print 'number of examples = %d' % features.get_num_vectors()
    # 	print 'number of features = %d' % features.get_num_features()

    assert features.get_num_vectors() == labels.get_num_labels()

    # train LMNN
    lmnn = LMNN(features, labels, k)
    lmnn.set_correction(100)
    # 	lmnn.io.set_loglevel(MSG_DEBUG)
    print "Training LMNN, this will take about two minutes..."
    lmnn.train()
    print "Training done!"

    # plot objective obtained during training
    statistics = lmnn.get_statistics()

    pyplot.plot(statistics.obj.get())
    pyplot.grid(True)
    pyplot.xlabel("Iterations")
    pyplot.ylabel("LMNN objective")
    pyplot.title("LMNN objective during training for the multiclass digits data set")

    pyplot.show()
    def train(self, images, labels):
        """
        Train eigenfaces
        """
        print "Train...",
        #copy labels
        self._labels = labels;

        #transform the numpe vector to shogun structure
        features = RealFeatures(images)
        #PCA
        self.pca = PCA()
        #set dimension
        self.pca.set_target_dim(self._num_components);
        #compute PCA
        self.pca.init(features)

        for sampleIdx in range(features.get_num_vectors()):
            v = features.get_feature_vector(sampleIdx);
            p = self.pca.apply_to_feature_vector(v);
            self._projections.insert(sampleIdx, p);

        print "ok!"
def preprocessor_normone_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):

	from modshogun import Chi2Kernel
	from modshogun import RealFeatures
	from modshogun import NormOne

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	preprocessor=NormOne()
	preprocessor.init(feats_train)
	feats_train.add_preprocessor(preprocessor)
	feats_train.apply_preprocessor()
	feats_test.add_preprocessor(preprocessor)
	feats_test.apply_preprocessor()

	kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()

	return km_train,km_test,kernel
def preprocessor_fisherlda_modular (data, labels, method):

	from modshogun import RealFeatures, MulticlassLabels, CANVAR_FLDA
	from modshogun import FisherLda

	sg_features = RealFeatures(data)
	sg_labels = MulticlassLabels(labels)

	preprocessor=FisherLda(method)
	preprocessor.init(sg_features, sg_labels, 1)
	yn=preprocessor.apply_to_feature_matrix(sg_features)

	return yn
    def fit(self, X, y):
        self.X_, y = check_X_y(X, y, dtype=float)
        labels = MulticlassLabels(y)
        self._lmnn = shogun_LMNN(RealFeatures(self.X_.T), labels, self.k)
        self._lmnn.set_maxiter(self.max_iter)
        self._lmnn.set_obj_threshold(self.convergence_tol)
        self._lmnn.set_regularization(self.regularization)
        self._lmnn.set_stepsize(self.learn_rate)
        if self.use_pca:
            self._lmnn.train()
        else:
            self._lmnn.train(np.eye(X.shape[1]))
        self.L_ = self._lmnn.get_linear_transform()
        return self
def run_clustering(data, k):
    from modshogun import KMeans
    from modshogun import Math_init_random
    from modshogun import EuclideanDistance
    from modshogun import RealFeatures

    fea = RealFeatures(data)
    distance = EuclideanDistance(fea, fea)
    kmeans = KMeans(k, distance)

    #print("Running clustering...")
    kmeans.train()

    return kmeans.get_cluster_centers()
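A minimal invocation sketch (assumption: two synthetic Gaussian blobs in 2-D stand in for real data):

import numpy

data = numpy.hstack((numpy.random.randn(2, 50), numpy.random.randn(2, 50) + 5))
centers = run_clustering(data, 2)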
def transfer_multitask_leastsquares_regression(fm_train=traindat,
                                               fm_test=testdat,
                                               label_train=label_traindat):

    from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup, MultitaskLeastSquaresRegression

    features = RealFeatures(traindat)
    labels = RegressionLabels(label_train)

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 2)
    task_two = Task(n_vectors // 2, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlsr = MultitaskLeastSquaresRegression(0.1, features, labels, task_group)
    mtlsr.set_regularization(1)  # use regularization ratio
    mtlsr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlsr.train()
    mtlsr.set_current_task(0)
    out = mtlsr.apply_regression().get_labels()
    return out
        def RunSVMShogun(q):
            totalTimer = Timer()

            Log.Info("Loading dataset", self.verbose)
            trainData, labels = SplitTrainData(self.dataset)
            trainData = RealFeatures(trainData.T)
            labels = MulticlassLabels(labels)
            testData = RealFeatures(LoadDataset(self.dataset[1]).T)

            try:
                with totalTimer:
                    self.model = self.BuildModel(trainData, labels, options)
                    # Run Support vector machines on the test dataset.
                    self.model.apply(testData).get_labels()
            except Exception as e:
                Log.Debug(str(e))
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)

            return time
def converter_localitypreservingprojections_modular(data_fname, k):
    try:
        from modshogun import RealFeatures, LocalityPreservingProjections, CSVFile

        features = RealFeatures(CSVFile(data_fname))
        converter = LocalityPreservingProjections()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.set_tau(2.0)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')
def classifier_libsvm_modular(train_fname=traindat,
                              test_fname=testdat,
                              label_fname=label_traindat,
                              width=2.1,
                              C=1,
                              epsilon=1e-5):
    from modshogun import RealFeatures, BinaryLabels
    from modshogun import GaussianKernel, LibSVM, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))
    kernel = GaussianKernel(feats_train, feats_train, width)

    svm = LibSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    supportvectors = sv_idx = svm.get_support_vectors()
    alphas = svm.get_alphas()
    predictions = svm.apply(feats_test)
    #print predictions.get_labels()
    return predictions, svm, predictions.get_labels()
def classifier_svmlin_modular(train_fname=traindat,
                              test_fname=testdat,
                              label_fname=label_traindat,
                              C=0.9,
                              epsilon=1e-5,
                              num_threads=1):
    from modshogun import RealFeatures, SparseRealFeatures, BinaryLabels
    from modshogun import SVMLin, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))

    svm = SVMLin(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(True)
    svm.train()

    bias = svm.get_bias()
    w = svm.get_w()
    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()
def classifier_featureblock_logistic_regression(fm_train=traindat,
                                                fm_test=testdat,
                                                label_train=label_traindat):

    from modshogun import BinaryLabels, RealFeatures, IndexBlock, IndexBlockGroup, FeatureBlockLogisticRegression

    features = RealFeatures(hstack((traindat, traindat)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_features = features.get_num_features()
    block_one = IndexBlock(0, n_features // 2)
    block_two = IndexBlock(n_features // 2, n_features)
    block_group = IndexBlockGroup()
    block_group.add_block(block_one)
    block_group.add_block(block_two)

    mtlr = FeatureBlockLogisticRegression(0.1, features, labels, block_group)
    mtlr.set_regularization(1)  # use regularization ratio
    mtlr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlr.train()
    out = mtlr.apply().get_labels()

    return out
    def train(self, images, labels):
        """
        Train eigenfaces
        """
        print "Train..."
        #copy labels
        self._labels = labels

        #transform the numpy matrix to shogun structure
        features = RealFeatures(images)
        #PCA
        self.pca = PCA()
        #set dimension
        self.pca.set_target_dim(self._num_components)
        #compute PCA
        self.pca.init(features)

        for sampleIdx in range(features.get_num_vectors()):
            v = features.get_feature_vector(sampleIdx)
            p = self.pca.apply_to_feature_vector(v)
            self._projections.insert(sampleIdx, p)

        print "Train ok!"
        def RunKNCShogun():
            totalTimer = Timer()

            Log.Info("Loading dataset", self.verbose)
            trainData, labels = SplitTrainData(self.dataset)
            trainData = RealFeatures(trainData.T)
            labels = MulticlassLabels(labels)
            testData = RealFeatures(LoadDataset(self.dataset[1]).T)

            try:
                with totalTimer:
                    self.model = self.BuildModel(trainData, labels, options)
                    # Run the k-nearest neighbors Classifier on the test dataset.
                    self.predictions = self.model.apply_multiclass(
                        testData).get_labels()
            except Exception as e:
                return [-1]

            time = totalTimer.ElapsedTime()
            if len(self.dataset) > 1:
                return [time, self.predictions]

            return [time]
def converter_isomap_modular(data_fname):
    try:
        from modshogun import RealFeatures, Isomap, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = Isomap()
        converter.set_k(20)
        converter.set_target_dim(1)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')
def clustering_kmeans_modular (fm_train=traindat,k=3):
	from modshogun import EuclideanDistance, RealFeatures, KMeans, Math_init_random, CSVFile
	Math_init_random(17)

	feats_train=RealFeatures(CSVFile(fm_train))
	distance=EuclideanDistance(feats_train, feats_train)

	kmeans=KMeans(k, distance)
	kmeans.train()

	out_centers = kmeans.get_cluster_centers()
	kmeans.get_radiuses()

	return out_centers, kmeans
def converter_linearlocaltangentspacealignment_modular(data_fname, k):
    try:
        from modshogun import RealFeatures, LinearLocalTangentSpaceAlignment, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = LinearLocalTangentSpaceAlignment()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')
    def RunMetrics(self, options):
        if len(self.dataset) >= 3:

            X, y = SplitTrainData(self.dataset)
            tau = re.search(r"-t (\d+)", options)
            tau = 1.0 if not tau else int(tau.group(1))
            model = LRR(tau, RealFeatures(X.T), RegressionLabels(y))
            model.train()

            testData = LoadDataset(self.dataset[1])
            truelabels = LoadDataset(self.dataset[2])

            predictedlabels = model.apply_regression(RealFeatures(
                testData.T)).get_labels()

            SimpleMSE = Metrics.SimpleMeanSquaredError(truelabels,
                                                       predictedlabels)
            metrics_dict = {}
            metrics_dict['Simple MSE'] = SimpleMSE
            return metrics_dict

        else:
            Log.Fatal("This method requires three datasets!")
def classifier_multilabeloutputliblinear_modular(
        fm_train_real=traindat,
        fm_test_real=testdat,
        label_train_multiclass=label_traindat,
        label_test_multiclass=label_testdat,
        width=2.1,
        C=1,
        epsilon=1e-5):
    from modshogun import RealFeatures, MulticlassLabels, MultilabelLabels
    from modshogun import MulticlassLibLinear

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    labels = MulticlassLabels(label_train_multiclass)

    classifier = MulticlassLibLinear(C, feats_train, labels)
    classifier.train()

    label_pred = classifier.apply_multilabel_output(feats_test, 2)
    out = label_pred.get_labels()
    #print out
    return out
def converter_hessianlocallylinearembedding_modular(data_fname, k):
    try:
        from modshogun import RealFeatures, HessianLocallyLinearEmbedding, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = HessianLocallyLinearEmbedding()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')
    def RunMetrics(self, options):
        Log.Info("Perform Linear Ridge Regression.", self.verbose)

        results = self.LinearRidgeRegressionShogun(options)
        if results < 0:
            return results

        metrics = {'Runtime': results}

        if len(self.dataset) >= 3:

            X, y = SplitTrainData(self.dataset)
            if "alpha" in options:
                tau = float(options.pop("alpha"))
            else:
                Log.Fatal("Required parameter 'alpha' not specified!")
                raise Exception("missing parameter")

            if len(options) > 0:
                Log.Fatal("Unknown parameters: " + str(options))
                raise Exception("unknown parameters")
            model = LRR(tau, RealFeatures(X.T), RegressionLabels(y))
            model.train()

            testData = LoadDataset(self.dataset[1])
            truelabels = LoadDataset(self.dataset[2])

            predictedlabels = model.apply_regression(RealFeatures(
                testData.T)).get_labels()

            SimpleMSE = Metrics.SimpleMeanSquaredError(truelabels,
                                                       predictedlabels)
            metrics['Simple MSE'] = SimpleMSE
            return metrics

        else:
            Log.Fatal("This method requires three datasets!")
def converter_laplacianeigenmaps_modular (data_fname,k):
	try:
		from modshogun import RealFeatures, LaplacianEigenmaps, CSVFile

		features = RealFeatures(CSVFile(data_fname))

		converter = LaplacianEigenmaps()
		converter.set_target_dim(1)
		converter.set_k(k)
		converter.set_tau(20.0)
		converter.apply(features)

		return features
	except ImportError:
		print('No Eigen3 available')
def preprocessor_dimensionreductionpreprocessor_modular (data, k):
	from modshogun import RealFeatures
	from modshogun import DimensionReductionPreprocessor
	from modshogun import LocallyLinearEmbedding

	features = RealFeatures(data)

	converter = LocallyLinearEmbedding()
	converter.set_k(k)

	preprocessor = DimensionReductionPreprocessor(converter)
	preprocessor.init(features)
	preprocessor.apply_to_feature_matrix(features)

	return features
def multiclass_chaidtree_modular(train=traindat,
                                 test=testdat,
                                 labels=label_traindat,
                                 ft=feattypes):
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, CHAIDTree
    except ImportError:
        print("Could not import Shogun modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # CHAID Tree formation with nominal dependent variable
    c = CHAIDTree(0, ft, 10)
    c.set_labels(train_labels)
    c.train(feats_train)

    # Classify test data
    output = c.apply_multiclass(feats_test).get_labels()

    return c, output
def regression_randomforest_modular(num_train=500,
                                    num_test=50,
                                    x_range=15,
                                    noise_var=0.2,
                                    ft=feattypes):
    try:
        from modshogun import RealFeatures, RegressionLabels, CSVFile, RandomForest, MeanRule, PT_REGRESSION
    except ImportError:
        print("Could not import Shogun modules")
        return

    random.seed(1)

    # form training dataset : y=x with noise
    X_train = random.rand(1, num_train) * x_range
    Y_train = X_train + random.randn(num_train) * noise_var

    # form test dataset
    X_test = array([[float(i) / num_test * x_range for i in range(num_test)]])

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(X_train)
    feats_test = RealFeatures(X_test)
    train_labels = RegressionLabels(Y_train[0])

    # Random Forest formation
    rand_forest = RandomForest(feats_train, train_labels, 20, 1)
    rand_forest.set_feature_types(ft)
    rand_forest.set_machine_problem_type(PT_REGRESSION)
    rand_forest.set_combination_rule(MeanRule())
    rand_forest.train()

    # Regress test data
    output = rand_forest.apply_regression(feats_test).get_labels()

    return rand_forest, output
    def fit(self, X, labels):
        self.X = X
        self.L = np.eye(X.shape[1])
        labels = MulticlassLabels(labels.astype(np.float64))
        self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, self.params['k'])
        self._lmnn.set_maxiter(self.params['max_iter'])
        self._lmnn.set_obj_threshold(self.params['convergence_tol'])
        self._lmnn.set_regularization(self.params['regularization'])
        self._lmnn.set_stepsize(self.params['learn_rate'])
        if self.params['use_pca']:
            self._lmnn.train()
        else:
            self._lmnn.train(self.L)
        self.L = self._lmnn.get_linear_transform()
        return self
def kernel_io_modular(train_fname=traindat, test_fname=testdat, width=1.9):
    from modshogun import RealFeatures, GaussianKernel, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    kernel = GaussianKernel(feats_train, feats_train, width)
    km_train = kernel.get_kernel_matrix()
    f = CSVFile("tmp/gaussian_train.csv", "w")
    kernel.save(f)
    del f

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    f = CSVFile("tmp/gaussian_test.csv", "w")
    kernel.save(f)
    del f

    #clean up
    import os
    os.unlink("tmp/gaussian_test.csv")
    os.unlink("tmp/gaussian_train.csv")

    return km_train, km_test, kernel
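A minimal invocation sketch (assumptions: the relative tmp/ directory must exist before kernel.save() can write into it, and the file arguments are placeholders for your own data files):

import os
if not os.path.exists("tmp"):
    os.mkdir("tmp")
km_train, km_test, kernel = kernel_io_modular("fm_train_real.dat", "fm_test_real.dat")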
    def fit(self, X, y):
        X, y = self._prepare_inputs(X, y, dtype=float,
                                    ensure_min_samples=2)
        labels = MulticlassLabels(y)
        self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, self.k)
        self._lmnn.set_maxiter(self.max_iter)
        self._lmnn.set_obj_threshold(self.convergence_tol)
        self._lmnn.set_regularization(self.regularization)
        self._lmnn.set_stepsize(self.learn_rate)
        if self.use_pca:
            self._lmnn.train()
        else:
            self._lmnn.train(np.eye(X.shape[1]))
        self.transformer_ = self._lmnn.get_linear_transform(X)
        return self
def classifier_multiclasslibsvm_modular(fm_train_real=traindat,
                                        fm_test_real=testdat,
                                        label_train_multiclass=label_traindat,
                                        width=2.1,
                                        C=1,
                                        epsilon=1e-5):
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import GaussianKernel
    from modshogun import MulticlassLibSVM

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    kernel = GaussianKernel(feats_train, feats_train, width)

    labels = MulticlassLabels(label_train_multiclass)

    svm = MulticlassLibSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    predictions = svm.apply()
    return predictions, svm, predictions.get_labels()
        def RunLinearRegressionShogun(q):
            totalTimer = Timer()

            # Load input dataset.
            # If the dataset contains two files then the second file is the responses
            # file.
            try:
                Log.Info("Loading dataset", self.verbose)
                if len(self.dataset) == 2:
                    testSet = np.genfromtxt(self.dataset[1], delimiter=',')

                # Use the last row of the training set as the responses.
                X, y = SplitTrainData(self.dataset)

                if len(options) > 0:
                    Log.Fatal("Unknown parameters: " + str(options))
                    raise Exception("unknown parameters")

                with totalTimer:
                    # Perform linear regression.
                    model = LeastSquaresRegression(RealFeatures(X.T),
                                                   RegressionLabels(y))
                    model.train()
                    b = model.get_w()

                    if len(self.dataset) == 2:
                        pred = model.apply(RealFeatures(testSet.T))
                        self.predictions = pred.get_labels()

            except Exception as e:
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
def compute_output_plot_isolines_sine(classifier,
                                      kernel,
                                      train,
                                      regression=False):
    x = 4 * rand(1, 500) - 2
    x.sort()
    test = RealFeatures(x)
    kernel.init(train, test)

    if regression:
        y = classifier.apply().get_labels()
    else:
        y = classifier.apply().get_values()

    return x, y
def transfer_multitask_logistic_regression(fm_train=traindat,
                                           fm_test=testdat,
                                           label_train=label_traindat):

    from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup, MultitaskLogisticRegression

    features = RealFeatures(hstack((traindat, traindat)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 2)
    task_two = Task(n_vectors // 2, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskLogisticRegression(0.1, features, labels, task_group)
    mtlr.set_regularization(1)  # use regularization ratio
    mtlr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply().get_labels()

    return out
def converter_diffusionmaps_modular(data_fname, t):
    try:
        from modshogun import RealFeatures, DiffusionMaps, GaussianKernel, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = DiffusionMaps()
        converter.set_target_dim(1)
        converter.set_kernel(GaussianKernel(10, 10.0))
        converter.set_t(t)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')
def stochasticgbmachine_modular(train=traindat,train_labels=label_traindat,ft=feat_types):
	try:
		from modshogun import RealFeatures, RegressionLabels, CSVFile, CARTree, StochasticGBMachine, SquaredLoss
	except ImportError:
		print("Could not import Shogun modules")
		return

	# wrap features and labels into Shogun objects
	feats=RealFeatures(CSVFile(train))
	labels=RegressionLabels(CSVFile(train_labels))

	# divide into training (90%) and test dataset (10%)
	p=np.random.permutation(labels.get_num_labels())
	num=int(labels.get_num_labels()*0.9)

	cart=CARTree()
	cart.set_feature_types(ft)
	cart.set_max_depth(1)
	loss=SquaredLoss()
	s=StochasticGBMachine(cart,loss,500,0.01,0.6)

	# train
	feats.add_subset(np.int32(p[0:num]))
	labels.add_subset(np.int32(p[0:num]))
	s.set_labels(labels)
	s.train(feats)
	feats.remove_subset()
	labels.remove_subset()

	# apply
	feats.add_subset(np.int32(p[num:len(p)]))
	labels.add_subset(np.int32(p[num:len(p)]))
	output=s.apply_regression(feats)

	feats.remove_subset()
	labels.remove_subset()

	return s,output
def statistics_hsic (n, difference, angle):
	from modshogun import RealFeatures
	from modshogun import DataGenerator
	from modshogun import GaussianKernel
	from modshogun import HSIC
	from modshogun import BOOTSTRAP, HSIC_GAMMA
	from modshogun import EuclideanDistance
	from modshogun import Math, Statistics, IntVector

	# init seed for reproducibility
	Math.init_random(1)

	# note that HSIC has to store kernel matrices,
	# which upper-bounds the feasible sample size

	# use data generator class to produce example data
	data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
	#plot(data[0], data[1], 'x');show()

	# create shogun feature representation
	features_x=RealFeatures(array([data[0]]))
	features_y=RealFeatures(array([data[1]]))

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, Shogun's kernel width is parametrized differently
	# Therefore 0.5*2*median_distance^2
	# Use a subset of only 200 elements for that; the median is stable
	subset=IntVector.randperm_vec(features_x.get_num_vectors())
	subset=subset[0:200]
	features_x.add_subset(subset)
	dist=EuclideanDistance(features_x, features_x)
	distances=dist.get_distance_matrix()
	features_x.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_x=median_distance**2
	features_y.add_subset(subset)
	dist=EuclideanDistance(features_y, features_y)
	distances=dist.get_distance_matrix()
	features_y.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_y=median_distance**2
	#print "median distance for Gaussian kernel on x:", sigma_x
	#print "median distance for Gaussian kernel on y:", sigma_y
	kernel_x=GaussianKernel(10,sigma_x)
	kernel_y=GaussianKernel(10,sigma_y)

	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05 using different methods to approximate
	# null-distribution
	statistic=hsic.compute_statistic()
	#print "HSIC:", statistic
	alpha=0.05

	#print "computing p-value using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	# normally, at least 250 iterations should be done, but that takes long
	hsic.set_bootstrap_iterations(100)
	# bootstrapping allows usage of unbiased or biased statistic
	p_value_boot=hsic.compute_p_value(statistic)
	thresh_boot=hsic.compute_threshold(alpha)
	#print "p_value:", p_value_boot
	#print "threshold for 0.05 alpha:", thresh_boot
	#print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_boot<alpha

	#print "computing p-value using gamma method"
	hsic.set_null_approximation_method(HSIC_GAMMA)
	p_value_gamma=hsic.compute_p_value(statistic)
	thresh_gamma=hsic.compute_threshold(alpha)
	#print "p_value:", p_value_gamma
	#print "threshold for 0.05 alpha:", thresh_gamma
	#print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value_gamma<alpha

	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance strongly depends on data/kernel
	# bootstrapping, biased statistic
	#print "sampling null distribution using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	hsic.set_bootstrap_iterations(100)
	null_samples=hsic.bootstrap_null()
	#print "null mean:", mean(null_samples)
	#print "null variance:", var(null_samples)
	#hist(null_samples, 100); show()

	return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
def predict_new_data(graph_file, cons_file, tri_file, other_feature_file):
    print "reading extracted features"
    graph_feature = read_feature_data(graph_file)
    graph_feature = get_normalized_given_max_min(graph_feature, "models/grtaph_max_size")
    cons_feature = read_feature_data(cons_file)
    cons_feature = get_normalized_given_max_min(cons_feature, "models/cons_max_size")
    CC_feature = read_feature_data(tri_file)
    CC_feature = get_normalized_given_max_min(CC_feature, "models/tri_max_size")
    ATOS_feature = read_feature_data(other_feature_file)
    ATOS_feature = get_normalized_given_max_min(ATOS_feature, "models/alu_max_size")

    width, C, epsilon, num_threads, mkl_epsilon, mkl_norm = 0.5, 1.2, 1e-5, 1, 0.001, 3.5
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    # pdb.set_trace()
    subkfeats_train = RealFeatures()
    subkfeats_test = RealFeatures(np.transpose(np.array(graph_feature)))
    subkernel = GaussianKernel(10, width)
    feats_test.append_feature_obj(subkfeats_test)

    fstream = SerializableAsciiFile("models/graph.dat", "r")
    status = subkfeats_train.load_serializable(fstream)
    feats_train.append_feature_obj(subkfeats_train)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures()
    subkfeats_test = RealFeatures(np.transpose(np.array(cons_feature)))
    subkernel = GaussianKernel(10, width)
    feats_test.append_feature_obj(subkfeats_test)

    fstream = SerializableAsciiFile("models/cons.dat", "r")
    status = subkfeats_train.load_serializable(fstream)
    feats_train.append_feature_obj(subkfeats_train)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures()
    subkfeats_test = RealFeatures(np.transpose(np.array(CC_feature)))
    subkernel = GaussianKernel(10, width)
    feats_test.append_feature_obj(subkfeats_test)

    fstream = SerializableAsciiFile("models/tri.dat", "r")
    status = subkfeats_train.load_serializable(fstream)
    feats_train.append_feature_obj(subkfeats_train)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures()
    subkfeats_test = RealFeatures(np.transpose(np.array(ATOS_feature)))
    subkernel = GaussianKernel(10, width)
    feats_test.append_feature_obj(subkfeats_test)

    fstream = SerializableAsciiFile("models/alu.dat", "r")
    status = subkfeats_train.load_serializable(fstream)
    feats_train.append_feature_obj(subkfeats_train)
    kernel.append_kernel(subkernel)

    model_file = "models/mkl.dat"
    if not os.path.exists(model_file):
        print "downloading model file"
        url_add = "http://rth.dk/resources/mirnasponge/data/mkl.dat"
        urllib.urlretrieve(url_add, model_file)
    print "loading trained model"
    fstream = SerializableAsciiFile("models/mkl.dat", "r")
    new_mkl = MKLClassification()
    status = new_mkl.load_serializable(fstream)

    print "model predicting"
    kernel.init(feats_train, feats_test)
    new_mkl.set_kernel(kernel)
    y_out = new_mkl.apply().get_labels()

    return y_out
def plot_data(x,y,axis):
	for idx,val in enumerate(numpy.unique(y)):
		xi = x[y==val]
		axis.scatter(xi[:,0], xi[:,1], s=50, facecolors='none', edgecolors=COLS[idx])

def plot_neighborhood_graph(x, nn, axis):
	for i in range(x.shape[0]):
		xs = [x[i,0], x[nn[1,i], 0]]
		ys = [x[i,1], x[nn[1,i], 1]]
		axis.plot(xs, ys, COLS[int(y[i])])

figure, axarr = pyplot.subplots(3, 1)
x, y = sandwich_data()

features = RealFeatures(x.T)
labels = MulticlassLabels(y)

print('%d vectors with %d features' % (features.get_num_vectors(), features.get_num_features()))
assert(features.get_num_vectors() == labels.get_num_labels())

distance = EuclideanDistance(features, features)
k = 2
knn = KNN(k, distance, labels)

plot_data(x, y, axarr[0])
plot_neighborhood_graph(x, knn.nearest_neighbors(), axarr[0])
axarr[0].set_aspect('equal')
axarr[0].set_xlim(-6, 4)
axarr[0].set_ylim(-3, 2)
	knn = KNN(k, distance, train_labels)
	knn.train()

	test_features, test_labels = testdat.features, testdat.labels

	predicted_labels = knn.apply(test_features)
	evaluator = MulticlassAccuracy()
	acc = evaluator.evaluate(predicted_labels, test_labels)
	err = 1-acc

	return err

features_file = '../data/fm_ape_gut.txt'
labels_file = '../data/label_ape_gut.txt'

features = RealFeatures(CSVFile(features_file))
labels = MulticlassLabels(CSVFile(labels_file))

# reduce the number of features to use so that the training is faster but still
# the results of feature selection are significant
fm = features.get_feature_matrix()
features = RealFeatures(fm[:500, :])

assert(features.get_num_vectors() == labels.get_num_labels())

print('Number of examples = %d, number of features = %d.' % (features.get_num_vectors(), features.get_num_features()))

visualize_tdsne(features, labels)
lmnn = diagonal_lmnn(features, labels, max_iter=1200)

diagonal_transform = lmnn.get_linear_transform()
#!/usr/bin/python

from scipy import io

data_dict = io.loadmat('../data/NBData20_train_preprocessed.mat')

xt = data_dict['xt']
yt = data_dict['yt']

import numpy
from modshogun import RealFeatures,MulticlassLabels,LMNN,MSG_DEBUG

features = RealFeatures(xt.T)
labels = MulticlassLabels(numpy.squeeze(yt))

k = 6
lmnn = LMNN(features,labels,k)
lmnn.io.set_loglevel(MSG_DEBUG)
lmnn.set_diagonal(True)
lmnn.set_maxiter(10000)
lmnn.train(numpy.eye(features.get_num_features()))

def feature_function():
    
    from modshogun import RealFeatures
    from modshogun import CSVFile
    import numpy as np

    #3x3 random matrix 
    feat_arr = np.random.rand(3, 3)
    
    #initialize RealFeatures from numpy array
    features = RealFeatures(feat_arr)

    #get matrix value function
    print(features.get_feature_matrix())

    #get selected column of matrix
    print(features.get_feature_vector(1))

    #get number of rows (feature dimension)
    print(features.get_num_features())

    #get number of columns (number of vectors)
    print(features.get_num_vectors())

    feats_from_csv = RealFeatures(CSVFile("csv/feature.csv"))
    print("csv is ", feats_from_csv.get_feature_matrix())
#!/usr/bin/env python2.7
#
# This software is distributed under BSD 3-clause license (see LICENSE file).
#
# Copyright (C) 2014 Thoralf Klein
#

from modshogun import RealFeatures, BinaryLabels, LibLinear
from numpy import random, mean

X_train = RealFeatures(random.randn(30, 100))
Y_train = BinaryLabels(random.randn(X_train.get_num_vectors()))

svm = LibLinear(1.0, X_train, Y_train)
svm.train()

Y_pred = svm.apply_binary(X_train)
Y_train.get_labels() == Y_pred.get_labels()

print "accuracy:", mean(Y_train.get_labels() == Y_pred.get_labels())
#!/usr/bin/python

from modshogun import CSVFile, RealFeatures, RescaleFeatures
from scipy.linalg import solve_triangular, cholesky, sqrtm, inv
import matplotlib.pyplot as pyplot
import numpy

# load wine features
features = RealFeatures(CSVFile('../data/fm_wine.dat'))

print('%d vectors with %d features.' % (features.get_num_vectors(), features.get_num_features()))
print('original features mean = ' + str(numpy.mean(features, axis=1)))

# rescale the features to [0,1]
feature_rescaling = RescaleFeatures()
feature_rescaling.init(features)
features.add_preprocessor(feature_rescaling)
features.apply_preprocessor()

print('mean after rescaling = ' + str(numpy.mean(features, axis=1)))

# remove mean from data
data = features.get_feature_matrix()
data = data.T
data -= numpy.mean(data, axis=0)
print(numpy.mean(data, axis=0))

fig, axarr = pyplot.subplots(1,2)
axarr[0].matshow(numpy.cov(data.T))

#### whiten data
def serialization_complex_example (num=5, dist=1, dim=10, C=2.0, width=10):
	import os
	from numpy import concatenate, zeros, ones
	from numpy.random import randn, seed
	from modshogun import RealFeatures, MulticlassLabels
	from modshogun import GMNPSVM
	from modshogun import GaussianKernel
	try:
		from modshogun import SerializableHdf5File,SerializableAsciiFile, \
				SerializableJsonFile,SerializableXmlFile,MSG_DEBUG
	except ImportError:
		return
	from modshogun import NormOne, LogPlusOne

	seed(17)

	data=concatenate((randn(dim, num), randn(dim, num) + dist,
					  randn(dim, num) + 2*dist,
					  randn(dim, num) + 3*dist), axis=1)
	lab=concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num)))

	feats=RealFeatures(data)
	#feats.io.set_loglevel(MSG_DEBUG)
	#feats.io.enable_file_and_line()
	kernel=GaussianKernel(feats, feats, width)

	labels=MulticlassLabels(lab)

	svm = GMNPSVM(C, kernel, labels)

	feats.add_preprocessor(NormOne())
	feats.add_preprocessor(LogPlusOne())
	feats.set_preprocessed(1)
	svm.train(feats)
	bias_ref = svm.get_svm(0).get_bias()

	#svm.print_serializable()

	fstream = SerializableHdf5File("blaah.h5", "w")
	status = svm.save_serializable(fstream)
	check_status(status,'h5')

	fstream = SerializableAsciiFile("blaah.asc", "w")
	status = svm.save_serializable(fstream)
	check_status(status,'asc')

	fstream = SerializableJsonFile("blaah.json", "w")
	status = svm.save_serializable(fstream)
	check_status(status,'json')

	fstream = SerializableXmlFile("blaah.xml", "w")
	status = svm.save_serializable(fstream)
	check_status(status,'xml')

	fstream = SerializableHdf5File("blaah.h5", "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status,'h5')
	new_svm.train()
	bias_h5 = new_svm.get_svm(0).get_bias()

	fstream = SerializableAsciiFile("blaah.asc", "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status,'asc')
	new_svm.train()
	bias_asc = new_svm.get_svm(0).get_bias()

	fstream = SerializableJsonFile("blaah.json", "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status,'json')
	new_svm.train()
	bias_json = new_svm.get_svm(0).get_bias()

	fstream = SerializableXmlFile("blaah.xml", "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status,'xml')
	new_svm.train()
	bias_xml = new_svm.get_svm(0).get_bias()

	os.unlink("blaah.h5")
	os.unlink("blaah.asc")
	os.unlink("blaah.json")
	os.unlink("blaah.xml")
	return svm,new_svm, bias_ref, bias_h5, bias_asc, bias_json, bias_xml
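The check_status() helper used above is not shown; a minimal sketch, assuming it merely asserts that each save/load round-trip succeeded:

def check_status(status, suffix):
	assert status, "serialization failed (%s)" % suffix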
def hsic_graphical():
	# parameters, change to get different results
	m=250
	difference=3

	# setting the angle lower makes a harder test
	angle=pi/30

	# number of samples taken from null and alternative distribution
	num_null_samples=500

	# use data generator class to produce example data
	data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)

	# create shogun feature representation
	features_x=RealFeatures(array([data[0]]))
	features_y=RealFeatures(array([data[1]]))

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, Shogun's kernel width is parametrized differently
	# Therefore 0.5*2*median_distance^2
	# Use a subset of only 200 elements for that; the median is stable
	subset=int32(array([x for x in range(features_x.get_num_vectors())])) # numpy
	subset=random.permutation(subset) # numpy permutation
	subset=subset[0:200]
	features_x.add_subset(subset)
	dist=EuclideanDistance(features_x, features_x)
	distances=dist.get_distance_matrix()
	features_x.remove_subset()
	median_distance=np.median(distances)
	sigma_x=median_distance**2
	features_y.add_subset(subset)
	dist=EuclideanDistance(features_y, features_y)
	distances=dist.get_distance_matrix()
	features_y.remove_subset()
	median_distance=np.median(distances)
	sigma_y=median_distance**2
	print "median distance for Gaussian kernel on x:", sigma_x
	print "median distance for Gaussian kernel on y:", sigma_y
	kernel_x=GaussianKernel(10,sigma_x)
	kernel_y=GaussianKernel(10,sigma_y)

	# create hsic instance. Note that this is a convenience constructor which copies
	# feature data. features_x and features_y are not those used in hsic.
	# This is only for user-friendliness. Usually, it's ok to do this.
	# Below, the alternative distribution is sampled, which means
	# that new feature objects have to be created in each iteration (slow)
	# However, normally, the alternative distribution is not sampled
	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

	# sample alternative distribution
	alt_samples=zeros(num_null_samples)
	for i in range(len(alt_samples)):
		data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)
		features_x.set_feature_matrix(array([data[0]]))
		features_y.set_feature_matrix(array([data[1]]))

		# re-create hsic instance every time since feature objects are copied due to
		# usage of the convenience constructor
		hsic=HSIC(kernel_x,kernel_y,features_x,features_y)
		alt_samples[i]=hsic.compute_statistic()

	# sample from null distribution
	# permutation, biased statistic
	hsic.set_null_approximation_method(PERMUTATION)
	hsic.set_num_null_samples(num_null_samples)
	null_samples_boot=hsic.sample_null()

	# fit gamma distribution, biased statistic
	hsic.set_null_approximation_method(HSIC_GAMMA)
	gamma_params=hsic.fit_null_gamma()
	# sample gamma with parameters
	null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)])

	# plot
	figure()

	# plot data x and y
	subplot(2,2,1)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	grid(True)
	plot(data[0], data[1], 'o')
	title('Data, rotation=$\pi$/'+str(1/angle*pi)+'\nm='+str(m))
	xlabel('$x$')
	ylabel('$y$')

	# compute threshold for test level
	alpha=0.05
	null_samples_boot.sort()
	null_samples_gamma.sort()
	thresh_boot=null_samples_boot[int(floor(len(null_samples_boot)*(1-alpha)))]
	thresh_gamma=null_samples_gamma[int(floor(len(null_samples_gamma)*(1-alpha)))]

	type_one_error_boot=sum(null_samples_boot>thresh_boot)/float(num_null_samples)
	type_one_error_gamma=sum(null_samples_gamma>thresh_gamma)/float(num_null_samples)

	# plot alternative distribution with threshold
	subplot(2,2,2)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	grid(True)
	hist(alt_samples, 20, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
	title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

	# compute range for all null distribution histograms
	hist_range=[min([min(null_samples_boot), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_gamma)])]

	# plot null distribution with threshold
	subplot(2,2,3)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	grid(True)
	hist(null_samples_boot, 20, range=hist_range, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	title('Sampled Null Dist.\n' + 'Type I error is '  + str(type_one_error_boot))

	# plot null distribution gamma
	subplot(2,2,4)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	grid(True)
	hist(null_samples_gamma, 20, range=hist_range, normed=True);
	axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
	title('Null Dist. Gamma\nType I error is '  + str(type_one_error_gamma))
	grid(True)

	# pull plots a bit apart
	subplots_adjust(hspace=0.5)
	subplots_adjust(wspace=0.5)