def classifier_svmlight_linear_term_modular(fm_train_dna=traindna,fm_test_dna=testdna, \
                                                label_train_dna=label_traindna,degree=3, \
                                                C=10,epsilon=1e-5,num_threads=1):
    
    from shogun.Features import StringCharFeatures, BinaryLabels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel
    from shogun.Classifier import SVMLight
    
    feats_train=StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test=StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)
    
    kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
    
    labels=BinaryLabels(label_train_dna)
    
    svm=SVMLight(C, kernel, labels)
    svm.set_qpsize(3)
    svm.set_linear_term(-numpy.array([1,2,3,4,5,6,7,8,7,6], dtype=numpy.double));
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()
    
    kernel.init(feats_train, feats_test)
    out = svm.apply().get_labels()
    return out,kernel
def classifier_domainadaptationsvm_modular(fm_train_dna=traindna,fm_test_dna=testdna, \
                                                label_train_dna=label_traindna, \
                                               label_test_dna=label_testdna,fm_train_dna2=traindna2,fm_test_dna2=testdna2, \
                                               label_train_dna2=label_traindna2,label_test_dna2=label_testdna2,C=1,degree=3):

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    labels = BinaryLabels(label_train_dna)
    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    #####################################

    #print("obtaining DA SVM from previously trained SVM")

    feats_train2 = StringCharFeatures(fm_train_dna, DNA)
    feats_test2 = StringCharFeatures(fm_test_dna, DNA)
    kernel2 = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    labels2 = BinaryLabels(label_train_dna)

    # we regularize against the previously obtained solution
    dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
    dasvm.train()

    out = dasvm.apply_binary(feats_test2)

    return out  #,dasvm TODO
def classifier_domainadaptationsvm_modular(fm_train_dna=traindna,fm_test_dna=testdna, \
                                                label_train_dna=label_traindna, \
                                               label_test_dna=label_testdna,fm_train_dna2=traindna2,fm_test_dna2=testdna2, \
                                               label_train_dna2=label_traindna2,label_test_dna2=label_testdna2,C=1,degree=3):



    
	feats_train = StringCharFeatures(fm_train_dna, DNA)
	feats_test = StringCharFeatures(fm_test_dna, DNA)
	kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
	labels = Labels(label_train_dna)
	svm = SVMLight(C, kernel, labels)
	svm.train()
	#svm.io.set_loglevel(MSG_DEBUG)
    
	#####################################
		
	#print "obtaining DA SVM from previously trained SVM"

	feats_train2 = StringCharFeatures(fm_train_dna, DNA)
	feats_test2 = StringCharFeatures(fm_test_dna, DNA)
	kernel2 = WeightedDegreeStringKernel(feats_train, feats_train, degree)
	labels2 = Labels(label_train_dna)

	# we regularize against the previously obtained solution
	dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
	dasvm.train()

	out = dasvm.apply(feats_test2).get_labels()

	return out #,dasvm TODO
示例#4
0
def svm_learn(kernel, labels, options):
    """train SVM using SVMLight or LibSVM

	Arguments:
	kernel -- kernel object from Shogun toolbox
	lebels -- list of labels
	options -- object containing option data 

	Return:
	trained svm object 
	"""

    try:
        svm = SVMLight(options.svmC, kernel,
                       Labels(numpy.array(labels, dtype=numpy.double)))
    except NameError:
        svm = LibSVM(options.svmC, kernel,
                     Labels(numpy.array(labels, dtype=numpy.double)))

    if options.quiet == False:
        svm.io.set_loglevel(MSG_INFO)
        svm.io.set_target_to_stderr()

    svm.set_epsilon(options.epsilon)
    svm.parallel.set_num_threads(1)
    if options.weight != 1.0:
        svm.set_C(options.svmC, options.svmC * options.weight)
    svm.train()

    if options.quiet == False:
        svm.io.set_loglevel(MSG_ERROR)

    return svm
示例#5
0
    def train(self, data, labels):
        """
        model training 
        """

        # centered WDK/WDK-shift
        if self.param["shifts"] == 0:
            kernel_center = WeightedDegreeStringKernel(self.param["degree"])
        else:
            kernel_center = WeightedDegreePositionStringKernel(
                10, self.param["degree"])
            shifts_vector = numpy.ones(
                self.param["center_offset"] * 2,
                dtype=numpy.int32) * self.param["shifts"]
            kernel_center.set_shifts(shifts_vector)

        kernel_center.set_cache_size(self.param["kernel_cache"] / 3)

        # border spetrum kernels
        size = self.param["kernel_cache"] / 3
        use_sign = False
        kernel_left = WeightedCommWordStringKernel(size, use_sign)
        kernel_right = WeightedCommWordStringKernel(size, use_sign)

        # assemble combined kernel
        kernel = CombinedKernel()
        kernel.append_kernel(kernel_center)
        kernel.append_kernel(kernel_left)
        kernel.append_kernel(kernel_right)

        ## building features
        feat = create_features(data, self.param["center_offset"],
                               self.param["center_pos"])

        # init combined kernel
        kernel.init(feat, feat)

        print "len(labels) = %i" % (len(labels))
        lab = BinaryLabels(numpy.double(labels))
        self.svm = SVMLight(self.param["cost"], kernel, lab)

        # show debugging output
        self.svm.io.enable_progress()
        self.svm.io.set_loglevel(MSG_DEBUG)

        # optimization settings
        num_threads = 2
        self.svm.parallel.set_num_threads(num_threads)
        self.svm.set_epsilon(10e-8)

        self.svm.train()

        return self
def classifier_svmlight_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,C=1.2,epsilon=1e-5,num_threads=1):
	from shogun.Features import StringCharFeatures, Labels, DNA
	from shogun.Kernel import WeightedDegreeStringKernel
	try:
		from shogun.Classifier import SVMLight
	except ImportError:
		print 'No support for SVMLight available.'
		return

	feats_train=StringCharFeatures(DNA)
	feats_train.set_features(fm_train_dna)
	feats_test=StringCharFeatures(DNA)
	feats_test.set_features(fm_test_dna)
	degree=20

	kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

	labels=Labels(label_train_dna)

	svm=SVMLight(C, kernel, labels)
	svm.set_epsilon(epsilon)
	svm.parallel.set_num_threads(num_threads)
	svm.train()

	kernel.init(feats_train, feats_test)
	svm.apply().get_labels()
	return kernel
示例#7
0
    def _train_single_svm(self, param, kernel, lab):
    

    
        kernel.set_cache_size(500)
        #lab = shogun_factory.create_labels(data.labels) 
        svm = SVMLight(param.cost, kernel, lab)

        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
        
        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
            
        # normalize cost
        #norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        #norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))

        #svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()

        return svm
def svm_learn(kernel, labels, options):
	"""train SVM using SVMLight or LibSVM

	Arguments:
	kernel -- kernel object from Shogun toolbox
	lebels -- list of labels
	options -- object containing option data 

	Return:
	trained svm object 
	"""

	try: 
		svm=SVMLight(options.svmC, kernel, Labels(numpy.array(labels, dtype=numpy.double)))
	except NameError:
		svm=LibSVM(options.svmC, kernel, Labels(numpy.array(labels, dtype=numpy.double)))

	if options.quiet == False:
		svm.io.set_loglevel(MSG_INFO)
		svm.io.set_target_to_stderr()

	svm.set_epsilon(options.epsilon)
	svm.parallel.set_num_threads(1)
	if options.weight != 1.0:
		svm.set_C(options.svmC, options.svmC*options.weight)
	svm.train()

	if options.quiet == False:
		svm.io.set_loglevel(MSG_ERROR)

	return svm
def create_svm(param, data, lab):
    """
    create SVM object with standard settings
    
    @param param: parameter object
    @param data: kernel or feature object (for kernelized/linear svm)
    @param lab: label object
    
    @return: svm object
    """

    # create SVM
    if param.flags.has_key(
            "svm_type") and param.flags["svm_type"] == "liblineardual":
        print "creating LibLinear object"
        svm = LibLinear(param.cost, data, lab)
        svm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)

        # set solver type
        if param.flags.has_key(
                "solver_type") and param.flags["solver_type"] == "L2R_LR":
            print "setting linear solver type to: L2R_LR"
            svm.set_liblinear_solver_type(L2R_LR)

    else:
        print "creating SVMLight object"
        svm = SVMLight(param.cost, data, lab)

    return set_svm_parameters(svm, param)
示例#10
0
文件: model.py 项目: kuod/genomeutils
class ShogunPredictor(object):
    """
    basic single-task promoter model using string kernels
    """

    def __init__(self, degree=4, shifts=32, kernel_cache=10000, cost=1.0):
        #TODO: clean up degree
        self.degree = degree
        self.degree_wdk = degree
        self.degree_spectrum = degree
        self.shifts = shifts
        self.kernel_cache = kernel_cache
        self.cost = cost
        self.center_offset = 50
        self.center_pos = 1200
        self.epsilon = 10e-2
        self.num_threads = 4


    def train(self, data, labels):

        kernel = create_promoter_kernel(data, self.center_offset, self.center_pos, self.degree_wdk, self.degree_spectrum, self.shifts, kernel_cache=self.kernel_cache)

        print "len(labels) = %i" % (len(labels))
        lab = create_labels(labels)
        self.svm = SVMLight(self.cost, kernel, lab)

        # show debugging output
        self.svm.io.enable_progress()
        self.svm.io.set_loglevel(MSG_DEBUG)

        # optimization settings
        num_threads = self.num_threads
        self.svm.parallel.set_num_threads(num_threads)
        self.svm.set_epsilon(self.epsilon)

        self.svm.train()

        return self


    def predict(self, data):

        feat = create_promoter_features(data, self.center_offset, self.center_pos)
        out = self.svm.apply(feat).get_values()

        return out
def svm_light ():
	print 'SVMLight'

	from shogun.Features import StringCharFeatures, Labels, DNA
	from shogun.Kernel import WeightedDegreeStringKernel
	try:
		from shogun.Classifier import SVMLight
	except ImportError:
		print 'No support for SVMLight available.'
		return

	feats_train=StringCharFeatures(DNA)
	feats_train.set_features(fm_train_dna)
	feats_test=StringCharFeatures(DNA)
	feats_test.set_features(fm_test_dna)
	degree=20

	kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

	C=1.2
	epsilon=1e-5
	num_threads=1
	labels=Labels(label_train_dna)

	svm=SVMLight(C, kernel, labels)
	svm.set_epsilon(epsilon)
	svm.parallel.set_num_threads(num_threads)
	svm.train()

	kernel.init(feats_train, feats_test)
	svm.classify().get_labels()
示例#12
0
    def _train_single_svm(self, param, kernel, lab):
    

    
        kernel.set_cache_size(500)
        #lab = shogun_factory.create_labels(data.labels) 
        svm = SVMLight(param.cost, kernel, lab)

        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
        
        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
            
        # normalize cost
        #norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        #norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))

        #svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()

        return svm
示例#13
0
    def train(self, data, labels):
        """
        model training 
        """

        # centered WDK/WDK-shift
        if self.param["shifts"] == 0:
            kernel_center = WeightedDegreeStringKernel(self.param["degree"])
        else:
            kernel_center = WeightedDegreePositionStringKernel(10, self.param["degree"])
            shifts_vector = numpy.ones(self.param["center_offset"]*2, dtype=numpy.int32)*self.param["shifts"]
            kernel_center.set_shifts(shifts_vector)

        kernel_center.set_cache_size(self.param["kernel_cache"]/3)

        # border spetrum kernels
        size = self.param["kernel_cache"]/3
        use_sign = False
        kernel_left = WeightedCommWordStringKernel(size, use_sign)
        kernel_right = WeightedCommWordStringKernel(size, use_sign)
        
        # assemble combined kernel
        kernel = CombinedKernel()
        kernel.append_kernel(kernel_center)
        kernel.append_kernel(kernel_left)
        kernel.append_kernel(kernel_right)

        ## building features 
        feat = create_features(data, self.param["center_offset"], self.param["center_pos"])
        
        # init combined kernel
        kernel.init(feat, feat)

        print "len(labels) = %i" % (len(labels))
        lab = BinaryLabels(numpy.double(labels))
        self.svm = SVMLight(self.param["cost"], kernel, lab)

        # show debugging output
        self.svm.io.enable_progress()
        self.svm.io.set_loglevel(MSG_DEBUG)

        # optimization settings
        num_threads = 2
        self.svm.parallel.set_num_threads(num_threads)
        self.svm.set_epsilon(10e-8)

        self.svm.train()

        return self
示例#14
0
def svm_learn(kernel, labels, svmC, epsilon, weight):
    """
	"""
    try:
        svm = SVMLight(svmC, kernel,
                       Labels(numpy.array(labels, dtype=numpy.double)))
    except NameError:
        print 'No support for SVMLight available.'
        return

    svm.io.set_loglevel(MSG_INFO)
    svm.io.set_target_to_stderr()

    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(1)
    if weight != 1.0:
        svm.set_C(svmC, svmC * weight)
    svm.train()
    svm.io.set_loglevel(MSG_ERROR)

    return svm
示例#15
0
文件: model.py 项目: kuod/genomeutils
    def train(self, data, labels):

        kernel = create_promoter_kernel(data, self.center_offset, self.center_pos, self.degree_wdk, self.degree_spectrum, self.shifts, kernel_cache=self.kernel_cache)

        print "len(labels) = %i" % (len(labels))
        lab = create_labels(labels)
        self.svm = SVMLight(self.cost, kernel, lab)

        # show debugging output
        self.svm.io.enable_progress()
        self.svm.io.set_loglevel(MSG_DEBUG)

        # optimization settings
        num_threads = self.num_threads
        self.svm.parallel.set_num_threads(num_threads)
        self.svm.set_epsilon(self.epsilon)

        self.svm.train()

        return self
示例#16
0
def svm_learn(kernel, labels, svmC, epsilon, weight):
	"""
	"""
	try: 
		svm=SVMLight(svmC, kernel, Labels(numpy.array(labels, dtype=numpy.double)))
	except NameError:
		print 'No support for SVMLight available.'
		return

	svm.io.set_loglevel(MSG_INFO)
	svm.io.set_target_to_stderr()

	svm.set_epsilon(epsilon)
	svm.parallel.set_num_threads(1)
	if weight != 1.0:
		svm.set_C(svmC, svmC*weight)
	svm.train()
	svm.io.set_loglevel(MSG_ERROR)

	return svm
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
		save object to file using pickle
		
		@param filename: name of destination file
		@type filename: str
		@param myobj: object to save (has to be pickleable)
		@type myobj: obj
		"""

        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(details)
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
		Load from filename using pickle
		
		@param filename: name of file to load from
		@type filename: str
		"""

        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(details)
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################

    seed(17)
    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                 axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################

    #print("labels:")
    #print(pickle.dumps(labels))
    #
    #print("features")
    #print(pickle.dumps(feats_train))
    #
    #print("kernel")
    #print(pickle.dumps(kernel))
    #
    #print("svm")
    #print(pickle.dumps(svm))
    #
    #print("#################################")

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)

    save(fn, svm)

    #print("#################################")

    #print("unserializing SVM")
    svm2 = load(fn)

    #print("#################################")
    #print("comparing training")

    svm2.train()

    #print("objective before serialization:", svm.get_objective())
    #print("objective after serialization:", svm2.get_objective())
    return svm, svm.get_objective(), svm2, svm2.get_objective()
示例#18
0
    def _inner_train(self, train_data, param):
        """
        perform inner training by processing the tree
        """

        data_keys = []
        # top-down processing of taxonomy


        classifiers = []
        classifier_at_node = {}

        root = param.taxonomy.data

        grey_nodes = [root]
        
        while len(grey_nodes)>0:
           
            node = grey_nodes.pop(0) # pop first item
            
            # enqueue children
            if node.children != None:
                grey_nodes.extend(node.children)
    

    
            #####################################################
            #     init data structures
            #####################################################

            # get data below current node
            data = [train_data[key] for key in node.get_data_keys()]
            
            data_keys.append(node.get_data_keys())
    
            print "data at current level"
            for instance_set in data:        
                print instance_set[0].dataset
            
            
            # initialize containers
            examples = []
            labels = []       
    

            # concatenate data
            for instance_set in data:
      
                print "train split_set:", instance_set[0].dataset.organism
                
                for inst in instance_set:
                    examples.append(inst.example)
                    labels.append(inst.label)
    

            # create shogun data objects
            k = shogun_factory.create_kernel(examples, param)
            lab = shogun_factory.create_labels(labels)


            #####################################################
            #    train weak learners    
            #####################################################
            
            cost = param.cost
            
            # set up svm
            svm = SVMLight(cost, k, lab)
                        
            if param.flags["normalize_cost"]:
                # set class-specific Cs
                norm_c_pos = param.cost / float(len([l for l in labels if l==1]))
                norm_c_neg = param.cost / float(len([l for l in labels if l==-1]))
                svm.set_C(norm_c_neg, norm_c_pos)
            
            
            print "using cost: negative class=%f, positive class=%f" % (norm_c_neg, norm_c_pos) 
            
            # enable output
            svm.io.enable_progress()
            svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
            
            # train
            svm.train()
            
            # append svm object
            classifiers.append(svm)
            classifier_at_node[node.name] = svm                            
            
            # save some information
            self.additional_information[node.name + " svm obj"] = svm.get_objective()
            self.additional_information[node.name + " svm num sv"] = svm.get_num_support_vectors()
            self.additional_information[node.name + " runtime"] = svm.get_runtime()


        return (classifiers, classifier_at_node)
示例#19
0
    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                 axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = Labels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################

    #print "labels:"
    #print pickle.dumps(labels)
    #
    #print "features"
    #print pickle.dumps(feats_train)
    #
    #print "kernel"
    #print pickle.dumps(kernel)
    #
    #print "svm"
示例#20
0
def solver_mtk_shogun(C, all_xt, all_lt, task_indicator, M, L, eps,
                      target_obj):
    """
    implementation using multitask kernel
    """

    xt = numpy.array(all_xt)
    lt = numpy.array(all_lt)
    tt = numpy.array(task_indicator, dtype=numpy.int32)
    tsm = numpy.array(M)

    print "task_sim:", tsm

    num_tasks = L.shape[0]

    # sanity checks
    assert len(xt) == len(lt) == len(tt)
    assert M.shape == L.shape
    assert num_tasks == len(set(tt))

    # set up shogun objects
    if type(xt[0]) == numpy.string_:
        feat = StringCharFeatures(DNA)
        xt = [str(a) for a in xt]
        feat.set_features(xt)
        base_kernel = WeightedDegreeStringKernel(feat, feat, 8)
    else:
        feat = RealFeatures(xt.T)
        base_kernel = LinearKernel(feat, feat)

    lab = Labels(lt)

    # set up normalizer
    normalizer = MultitaskKernelNormalizer(tt.tolist())

    for i in xrange(num_tasks):
        for j in xrange(num_tasks):
            normalizer.set_task_similarity(i, j, M[i, j])

    print "num of unique tasks: ", normalizer.get_num_unique_tasks(
        task_indicator)

    # set up kernel
    base_kernel.set_cache_size(2000)
    base_kernel.set_normalizer(normalizer)
    base_kernel.init_normalizer()

    # set up svm
    svm = SVMLight()  #LibSVM()

    svm.set_epsilon(eps)
    #print "reducing num threads to one"
    #svm.parallel.set_num_threads(1)
    #print "using one thread"

    # how often do we like to compute objective etc
    svm.set_record_interval(0)
    svm.set_target_objective(target_obj)

    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)
    svm.io.set_loglevel(MSG_DEBUG)
    #SET THREADS TO 1

    svm.set_C(C, C)
    svm.set_bias_enabled(False)

    # prepare for training
    svm.set_labels(lab)
    svm.set_kernel(base_kernel)

    # train svm
    svm.train()

    train_times = svm.get_training_times()
    objectives = [-obj for obj in svm.get_dual_objectives()]

    if False:

        # get model parameters
        sv_idx = svm.get_support_vectors()
        sparse_alphas = svm.get_alphas()

        assert len(sv_idx) == len(sparse_alphas)

        # compute dense alpha (remove label)
        alphas = numpy.zeros(len(xt))
        for id_sparse, id_dense in enumerate(sv_idx):
            alphas[id_dense] = sparse_alphas[id_sparse] * lt[id_dense]

        # print alphas
        W = alphas_to_w(alphas, xt, lt, task_indicator, M)
        primal_obj = compute_primal_objective(
            W.reshape(W.shape[0] * W.shape[1]), C, all_xt, all_lt,
            task_indicator, L)
        objectives.append(primal_obj)
        train_times.append(train_times[-1] + 100)

    return objectives, train_times
示例#21
0
for i in xrange(2):
    for j in xrange(2):

        if i==j:
            normalizer.set_task_similarity(i,j, 4.0)
        else:
            normalizer.set_task_similarity(i,j, 1.0)


base_wdk.set_normalizer(normalizer)

print base_wdk.get_kernel_matrix()
print "--->",base_wdk.get_normalizer().get_name()

svm = SVMLight(1, base_wdk, lab)
svm.set_linadd_enabled(False)
svm.set_batch_computation_enabled(False)

svm.train(feat)

print "interally modified kernel. objective:", svm.get_objective()



##################################################################
# regular SVM
##################################################################


wdk = WeightedDegreeStringKernel(feat, feat, 1)
def serialization_string_kernels_modular(n_data, num_shifts, size):
    """
    serialize svm with string kernels
    """

    ##################################################
    # set up toy data and svm
    train_xt, train_lt = generate_random_data(n_data)
    test_xt, test_lt = generate_random_data(n_data)

    feats_train = construct_features(train_xt)
    feats_test = construct_features(test_xt)

    max_len = len(train_xt[0])
    kernel_wdk = WeightedDegreePositionStringKernel(size, 5)
    shifts_vector = numpy.ones(max_len, dtype=numpy.int32)*num_shifts
    kernel_wdk.set_shifts(shifts_vector)

    ########
    # set up spectrum
    use_sign = False
    kernel_spec_1 = WeightedCommWordStringKernel(size, use_sign)
    kernel_spec_2 = WeightedCommWordStringKernel(size, use_sign)

    ########
    # combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_wdk)
    kernel.append_kernel(kernel_spec_1)
    kernel.append_kernel(kernel_spec_2)

    # init kernel
    labels = BinaryLabels(train_lt);

    svm = SVMLight(1.0, kernel, labels)
    #svm.io.set_loglevel(MSG_DEBUG)
    svm.train(feats_train)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    #print("unserializing SVM")
    svm2 = load(fn)


    #print("comparing predictions")
    out =  svm.apply(feats_test).get_labels()
    out2 =  svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in range(len(out)):
        assert abs(out[i] - out2[i] < 0.000001)

    #print("all checks passed.")

    return out,out2
示例#23
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_empty_kernel(param)
        lab = shogun_factory.create_labels(data.labels)

        combined_kernel = CombinedKernel()
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
        base_features = shogun_factory.create_features(data.examples)
        combined_features = CombinedFeatures()

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        # load data
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
        f = file(
            "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt"
        )
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

        num_lines = int(f.readline().strip())
        task_distances = numpy.zeros((num_lines, num_lines))
        name_to_id = {}
        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")
            name = str(tokens[0])
            name_to_id[name] = i
            entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0])
            assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (
                len(entry), num_lines)
            task_distances[i, :] = entry

        # cut relevant submatrix
        active_ids = [name_to_id[name] for name in data.get_task_names()]
        tmp_distances = task_distances[active_ids, :]
        tmp_distances = tmp_distances[:, active_ids]
        print "distances ", tmp_distances.shape

        # normalize distances
        task_distances = task_distances / numpy.max(tmp_distances)

        similarities = numpy.zeros(
            (data.get_num_tasks(), data.get_num_tasks()))

        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                # convert similarity with simple transformation
                similarity = param.base_similarity - task_distances[
                    name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

                # save for later
                similarities[data.name_to_id(task_name_lhs),
                             data.name_to_id(task_name_rhs)] = similarity

        # set normalizer
        base_wdk.set_normalizer(normalizer)
        #base_wdk.init_normalizer()

        combined_features.append_feature_obj(base_features)
        combined_kernel.append_kernel(base_wdk)

        ##################################################
        # intra-domain blocks

        intra_block_vec = PairiiVec()

        for task_id in data.get_task_ids():
            intra_block_vec.push_back(Pairii(task_id, task_id))

        # create mask-based normalizer
        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums,
                                                       intra_block_vec)
        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)

        # append features
        combined_features.append_feature_obj(base_features)

        # set mixing factor (used if MKL is OFF)
        assert (param.base_similarity <= 1)
        assert (param.base_similarity >= 0)
        combined_kernel.set_subkernel_weights(
            [param.base_similarity, 1 - param.base_similarity])

        combined_kernel.init(combined_features, combined_features)

        svm = None

        print "using MKL:", (param.transform >= 1.0)

        if param.transform >= 1.0:

            svm = MKLClassification()

            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto

            svm.set_C(param.cost, param.cost)

            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)

        else:

            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)

            svm = SVMLight(param.cost, combined_kernel, lab)

        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        print "WARNING: custom epsilon set"
        svm.set_epsilon(0.05)

        # normalize cost
        norm_c_pos = param.cost / float(len([l
                                             for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)

        # start training
        svm.train()

        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        self.additional_information["similarities"] = similarities
        self.additional_information[
            "post_weights"] = combined_kernel.get_subkernel_weights()

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in data.get_task_names():

            task_num = data.name_to_id(task_name)

            # save svm and task_num
            svms[task_name] = (task_num, combined_kernel, svm)

        return svms
from numpy import array, float64
import sys

# create dense matrices A,B,C
A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
B=array([1,1,1,-1,-1,-1], dtype=float64)


# ... of type Real, LongInt and Byte
feats_train = RealFeatures(A.transpose())
kernel = GaussianKernel(feats_train, feats_train, 1.0)
kernel.io.set_loglevel(MSG_DEBUG)

lab = Labels(B)

svm = SVMLight(1, kernel, lab)
svm.train()


helper.save("/tmp/awesome_svm", svm)
svm = helper.load("/tmp/awesome_svm")

svm.train()


#sys.exit(0)


run = expenv.Run.get(1010)
#run = expenv.Run.get(974)
dat = run.get_train_data()
                                       data.name_to_id(t2_name), similarity)

##################################################################
# Train SVMs
##################################################################

# create shogun objects
wdk_tree = shogun_factory.create_kernel(data.examples, param)
lab = shogun_factory.create_labels(data.labels)

wdk_tree.set_normalizer(tree_normalizer)
wdk_tree.init_normalizer()

print "--->", wdk_tree.get_normalizer().get_name()

svm_tree = SVMLight(cost, wdk_tree, lab)
svm_tree.set_linadd_enabled(False)
svm_tree.set_batch_computation_enabled(False)

svm_tree.train()

del wdk_tree
del tree_normalizer

print "finished training tree-norm SVM:", svm_tree.get_objective()

wdk = shogun_factory.create_kernel(data.examples, param)
wdk.set_normalizer(normalizer)
wdk.init_normalizer()

print "--->", wdk.get_normalizer().get_name()
示例#26
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        ########################################################
        print "creating a kernel for each node:"
        ########################################################

        # init seq handler
        task_kernel = SequencesHandlerRbf(1, param.base_similarity,
                                          data.get_task_names(),
                                          param.flags["wdk_rbf_on"])
        similarities = numpy.zeros(
            (data.get_num_tasks(), data.get_num_tasks()))

        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                # convert similarity with simple transformation
                similarity = task_kernel.get_similarity(
                    task_name_lhs, task_name_rhs)

                print similarity

                print "similarity (%s,%s)=%f" % (task_name_lhs, task_name_rhs,
                                                 similarity)

                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

                # save for later
                similarities[data.name_to_id(task_name_lhs),
                             data.name_to_id(task_name_rhs)] = similarity

        # set normalizer
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()

        # set up svm
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        # normalize cost
        norm_c_pos = param.cost / float(len([l
                                             for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)

        # start training
        svm.train()

        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in data.get_task_names():

            task_num = data.name_to_id(task_name)

            # save svm and task_num
            svms[task_name] = (task_num, param, svm)

        return svms
示例#27
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        assert (param.base_similarity >= 1)

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        # load data
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
        f = file(
            "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt"
        )
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

        num_lines = int(f.readline().strip())
        task_distances = numpy.zeros((num_lines, num_lines))
        name_to_id = {}
        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")
            name = str(tokens[0])
            name_to_id[name] = i
            entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0])
            assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (
                len(entry), num_lines)
            task_distances[i, :] = entry

        # cut relevant submatrix
        active_ids = [name_to_id[name] for name in data.get_task_names()]
        tmp_distances = task_distances[active_ids, :]
        tmp_distances = tmp_distances[:, active_ids]
        print "distances ", tmp_distances.shape

        # normalize distances
        task_distances = task_distances / numpy.max(tmp_distances)

        similarities = numpy.zeros(
            (data.get_num_tasks(), data.get_num_tasks()))

        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                # convert similarity with simple transformation
                similarity = param.base_similarity - task_distances[
                    name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

                # save for later
                similarities[data.name_to_id(task_name_lhs),
                             data.name_to_id(task_name_rhs)] = similarity

        # set normalizer
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()

        # set up svm
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        # normalize cost
        norm_c_pos = param.cost / float(len([l
                                             for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))

        svm.set_C(norm_c_neg, norm_c_pos)

        # start training
        svm.train()

        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in data.get_task_names():

            task_num = data.name_to_id(task_name)

            # save svm and task_num
            svms[task_name] = (task_num, param, svm)

        return svms
示例#28
0
labels_presvm = [i.label for i in d[0:subset_size]]

labels_presvm[2] = 1
labels_presvm[12] = 1
labels_presvm[15] = 1
labels_presvm[8] = 1
labels_presvm[19] = 1


feat_presvm = StringCharFeatures(DNA)
feat_presvm.set_features(examples_presvm)
wdk_presvm = WeightedDegreeStringKernel(feat_presvm, feat_presvm, 1)
lab_presvm = Labels(numpy.array(labels_presvm))


presvm = SVMLight(1, wdk_presvm, lab_presvm)
presvm.train()

presvm2 = LibSVM(1, wdk_presvm, lab_presvm)
presvm2.train()

print "svmlight", presvm.get_objective()
print "libsvm", presvm2.get_objective()

assert(abs(presvm.get_objective() - presvm2.get_objective())<= 0.001)

print "simple svm", presvm.get_objective()

print "len(examples_presvm)", len(examples_presvm)

print "##############"
示例#29
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
          
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)
        
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)
        


        ########################################################
        print "creating a kernel for each node:"
        ########################################################


        # assemble combined kernel
        
        combined_kernel = CombinedKernel()
        
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
        
        
        base_features = shogun_factory.create_features(data.examples)
        
        combined_features = CombinedFeatures()
        
        
        
        
        ##################################################
        # intra-domain blocks
        
        
        #        intra_block_vec = PairiiVec()
        #        
        #        for task_id in data.get_task_ids():
        #            intra_block_vec.push_back(Pairii(task_id, task_id))
        #        
        #        
        #        
        #        # create mask-based normalizer
        #        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)        
        #        kernel = shogun_factory.create_empty_kernel(param)
        #        kernel.set_normalizer(normalizer)
        #        
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel)
        #    
        #        # append features
        #        combined_features.append_feature_obj(base_features)
        #
        #        print "------"
        #        
        #        ##################################################
        #        # all blocks
        #        
        #        
        #        all_block_vec = PairiiVec()
        #        
        #        for task_id_1 in data.get_task_ids():
        #            for task_id_2 in data.get_task_ids():
        #                all_block_vec.push_back(Pairii(task_id_1, task_id_2))
        #                
        #        
        #        # create mask-based normalizer
        #        normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)        
        #        kernel_all = shogun_factory.create_empty_kernel(param)
        #        kernel_all.set_normalizer(normalizer_all)
        #                
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel_all)
        #    
        #        # append features
        #        combined_features.append_feature_obj(base_features)

        
        ##################################################
        # add one kernel per similarity position
        
        
        # init seq handler 
        pseudoseqs = SequencesHandler()
        pseudoseq_length = pseudoseqs.seq_length


        for pos in range(pseudoseq_length):
            
            print "appending kernel for pos %i" % (pos)
        
            print "nums", data.task_vector_nums

    
            pos_block_vec = PairiiVec()
    
            # set similarity
            for task_name_lhs in data.get_task_names():
                for task_name_rhs in data.get_task_names():
                    
                    similarity = pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pos)
                    #print "computing similarity for tasks (%s, %s) = %i" % (task_name_lhs, task_name_rhs, similarity)
                    
                    if similarity == 1:                    
                        tmp_pair = Pairii(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs))
                        pos_block_vec.push_back(tmp_pair)

            print "creating normalizer"
            normalizer_pos = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, pos_block_vec)   

            print "creating empty kernel"
            kernel_pos = shogun_factory.create_empty_kernel(param)
            
            print "setting normalizer"
            kernel_pos.set_normalizer(normalizer_pos)
                
            print "appending kernel"
            # append current kernel to CombinedKernel
            combined_kernel.append_kernel(kernel_pos)
    
            print "appending features"
            # append features
            combined_features.append_feature_obj(base_features)
        
        
        print "done constructing combined kernel"
        
        ##################################################
        # init combined kernel
        
        combined_kernel.init(combined_features, combined_features)    
        
            

                
        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None
                
        
        print "using MKL:", (param.transform >= 1.0)
        
        if param.transform >= 1.0:
            
            svm = MKLClassification()
            
            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto
        
            svm.set_C(param.cost, param.cost)
            
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)
            
                
        else:
            
            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)
            
            svm = SVMLight(param.cost, combined_kernel, lab)


        # set up SVM
        num_threads = 8
        svm.io.enable_progress()
        #svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
        
        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        print "WARNING: custom epsilon set"
        svm.set_epsilon(0.05)    
        
        # normalize cost
        norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
        
        svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()
    
        
        # save additional info
        self.additional_information["svm_objective"] = svm.get_objective()
        self.additional_information["svm num sv"] = svm.get_num_support_vectors()
        self.additional_information["mkl weights post-training"] = combined_kernel.get_subkernel_weights()
        
        print self.additional_information 
        
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm)

        
        return svms
示例#30
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """


        assert(param.base_similarity >= 1)
        
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)
        
        
        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)
        
        # load data
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
        f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
        #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")
        
        num_lines = int(f.readline().strip())
        task_distances = numpy.zeros((num_lines, num_lines))
        name_to_id = {}
        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")
            name = str(tokens[0])
            name_to_id[name] = i
            entry = numpy.array([v for (j,v) in enumerate(tokens) if j!=0])
            assert len(entry)==num_lines, "len_entry %i, num_lines %i" % (len(entry), num_lines)
            task_distances[i,:] = entry
            
        
        # cut relevant submatrix
        active_ids = [name_to_id[name] for name in data.get_task_names()] 
        tmp_distances = task_distances[active_ids, :]
        tmp_distances = tmp_distances[:, active_ids]
        print "distances ", tmp_distances.shape

        
        # normalize distances
        task_distances = task_distances / numpy.max(tmp_distances)
        
        
        similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))
                                
        
        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():
                
                
                # convert similarity with simple transformation
                similarity = param.base_similarity - task_distances[name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)
                
                # save for later
                similarities[data.name_to_id(task_name_lhs),data.name_to_id(task_name_rhs)] = similarity
                
                
        # set normalizer                
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()
        

        # set up svm
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        
        # normalize cost
        norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
        
        svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()


        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities


        # wrap up predictors
        svms = {}
        
        # use a reference to the same svm several times
        for task_name in data.get_task_names():
            
            task_num = data.name_to_id(task_name)
            
            # save svm and task_num
            svms[task_name] = (task_num, param, svm)

        return svms
示例#31
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # fetch taxonomy from parameter object
        taxonomy = shogun_factory.create_taxonomy(param.taxonomy.data)

        # set normalizer
        normalizer = MultitaskKernelTreeNormalizer(data.task_vector_names,
                                                   data.task_vector_names,
                                                   taxonomy)

        ########################################################
        gammas = self.taxonomy_to_gammas(data, taxonomy)
        print "gammas before MKL:"
        print gammas
        ########################################################

        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()

        svm = None

        num_subk = base_wdk.get_num_subkernels()

        print "num subkernels:", num_subk

        #print "subkernel weights:", base_wdk.get_subkernel_weights()

        self.additional_information["weights_before"] = [
            normalizer.get_beta(i) for i in range(num_subk)
        ]

        print "using MKL:", (param.transform >= 1.0)

        if param.transform >= 1.0:

            num_threads = 4

            svm = MKLClassification()

            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #GLPK) #DIRECT) #NEWTON)#ST_CPLEX)

            svm.set_kernel(base_wdk)
            svm.set_labels(lab)

            svm.parallel.set_num_threads(num_threads)
            svm.set_linadd_enabled(False)
            svm.set_batch_computation_enabled(False)

            if param.flags["normalize_cost"]:
                # normalize cost
                norm_c_pos = param.cost / float(
                    len([l for l in data.labels if l == 1]))
                norm_c_neg = param.cost / float(
                    len([l for l in data.labels if l == -1]))

                svm.set_C(norm_c_neg, norm_c_pos)
            else:
                svm.set_C(param.cost, param.cost)

            svm.train()

            #print "subkernel weights (after):", base_wdk.get_subkernel_weights()

        else:

            # create SVM (disable unsupported optimizations)
            svm = SVMLight(param.cost, base_wdk, lab)
            svm.set_linadd_enabled(False)
            svm.set_batch_computation_enabled(False)

            svm.train()

        print "svm objective:", svm.get_objective()

        self.additional_information["weights"] = [
            normalizer.get_beta(i) for i in range(num_subk)
        ]
        self.additional_information["gammas"] = self.taxonomy_to_gammas(
            data, taxonomy)

        print "debug weights:"
        print self.additional_information
        print ""

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_id in train_data.keys():
            svms[task_id] = svm

        return svms
示例#32
0
def classifier_svmlight_linear_term_modular(fm_train_dna=traindna,fm_test_dna=testdna, \
                                                label_train_dna=label_traindna,degree=3, \
                                                C=10,epsilon=1e-5,num_threads=1):

    from shogun.Features import StringCharFeatures, Labels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel
    from shogun.Classifier import SVMLight

    feats_train = StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = Labels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_qpsize(3)
    svm.set_linear_term(
        -numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 7, 6], dtype=numpy.double))
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)
    out = svm.apply().get_labels()
    return out, kernel
	seed(17)
	traindata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
	testdata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1);

	trainlab=concatenate((-ones(num), ones(num)));
	testlab=concatenate((-ones(num), ones(num)));

	feats_train=RealFeatures(traindata_real);
	feats_test=RealFeatures(testdata_real);
	kernel=GaussianKernel(feats_train, feats_train, width);
	#kernel.io.set_loglevel(MSG_DEBUG)

	labels=Labels(trainlab);

	svm=SVMLight(C, kernel, labels)
	svm.train()
	#svm.io.set_loglevel(MSG_DEBUG)

	##################################################

	#print "labels:"
	#print pickle.dumps(labels)
	#
	#print "features"
	#print pickle.dumps(feats_train)
	#
	#print "kernel"
	#print pickle.dumps(kernel)
	#
	#print "svm"
示例#34
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
          
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)
        
        
        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # set normalizer
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        ########################################################
        print "creating a kernel for each node:"
        ########################################################

        
        # init seq handler 
        task_kernel = SequencesHandlerRbf(1, param.base_similarity, data.get_task_names(), param.flags["wdk_rbf_on"])
        similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))
        
        # convert distance to similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():
                
                
                 
                
                # convert similarity with simple transformation
                similarity = task_kernel.get_similarity(task_name_lhs, task_name_rhs)
                
                print similarity
                
                print "similarity (%s,%s)=%f" % (task_name_lhs, task_name_rhs, similarity)
                
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)
                
                # save for later
                similarities[data.name_to_id(task_name_lhs),data.name_to_id(task_name_rhs)] = similarity
                
                
        # set normalizer                
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()
        

        # set up svm
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        
        
        # normalize cost
        norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
        
        svm.set_C(norm_c_neg, norm_c_pos)
        
        
        # start training
        svm.train()


        # save additional information
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["num sv"] = svm.get_num_support_vectors()
        #self.additional_information["distances"] = distances
        self.additional_information["similarities"] = similarities


        # wrap up predictors
        svms = {}
        
        # use a reference to the same svm several times
        for task_name in data.get_task_names():
            
            task_num = data.name_to_id(task_name)
            
            # save svm and task_num
            svms[task_name] = (task_num, param, svm)

        return svms
示例#35
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        import numpy
        numpy.random.seed(666)

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)

        # create shogun label
        lab = shogun_factory.create_labels(data.labels)

        # assemble combined kernel
        combined_kernel = CombinedKernel()
        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_DEBUG)
        # set kernel cache
        if param.flags.has_key("cache_size"):
            combined_kernel.set_cache_size(param.flags["cache_size"])

        # create features
        base_features = shogun_factory.create_features(data.examples, param)

        combined_features = CombinedFeatures()

        ########################################################
        print "creating a masked kernel for possible subset:"
        ########################################################

        power_set_tasks = power_set(data.get_task_ids())

        for active_task_ids in power_set_tasks:

            print "masking all entries other than:", active_task_ids

            # create mask-based normalizer
            normalizer = MultitaskKernelMaskNormalizer(data.task_vector_nums,
                                                       data.task_vector_nums,
                                                       active_task_ids)

            # normalize trace
            if param.flags.has_key(
                    "normalize_trace") and param.flags["normalize_trace"]:
                norm_factor = len(data.get_task_ids()) / len(active_task_ids)
                normalizer.set_normalization_constant(norm_factor)

            kernel = shogun_factory.create_empty_kernel(param)
            kernel.set_normalizer(normalizer)

            # append current kernel to CombinedKernel
            combined_kernel.append_kernel(kernel)

            # append features
            combined_features.append_feature_obj(base_features)

            print "------"

        combined_kernel.init(combined_features, combined_features)

        #combined_kernel.precompute_subkernels()

        self.additional_information[
            "weights before trainng"] = combined_kernel.get_subkernel_weights(
            )
        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None

        print "using MKL:", (param.flags["mkl_q"] >= 1.0)

        if param.flags["mkl_q"] >= 1.0:

            svm = MKLClassification()

            svm.set_mkl_norm(param.flags["mkl_q"])

            # set interleaved optimization
            if param.flags.has_key("interleaved"):
                svm.set_interleaved_optimization_enabled(
                    param.flags["interleaved"])

            # set solver type
            if param.flags.has_key(
                    "solver_type") and param.flags["solver_type"]:
                if param.flags["solver_type"] == "ST_CPLEX":
                    svm.set_solver_type(ST_CPLEX)
                if param.flags["solver_type"] == "ST_DIRECT":
                    svm.set_solver_type(ST_DIRECT)
                if param.flags["solver_type"] == "ST_NEWTON":
                    svm.set_solver_type(ST_NEWTON)
                if param.flags["solver_type"] == "ST_GLPK":
                    svm.set_solver_type(ST_GLPK)

            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)

        else:

            svm = SVMLight(param.cost, combined_kernel, lab)

        # optimization settings
        num_threads = 4
        svm.parallel.set_num_threads(num_threads)

        if param.flags.has_key("epsilon"):
            svm.set_epsilon(param.flags["epsilon"])

        # enable output
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

        # disable unsupported optimizations (due to special normalizer)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        # set cost
        if param.flags["normalize_cost"]:

            norm_c_pos = param.cost / float(
                len([l for l in data.labels if l == 1]))
            norm_c_neg = param.cost / float(
                len([l for l in data.labels if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)

        else:

            svm.set_C(param.cost, param.cost)

        svm.train()

        # prepare mapping
        weight_map = {}
        weights = combined_kernel.get_subkernel_weights()
        for (i, pset) in enumerate(power_set_tasks):
            print pset
            subset_str = str([data.id_to_name(task_idx) for task_idx in pset])
            weight_map[subset_str] = weights[i]

        # store additional info
        self.additional_information["svm objective"] = svm.get_objective()
        self.additional_information["weight_map"] = weight_map

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name),
                               len(power_set_tasks), combined_kernel, svm,
                               param)

        return svms
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        for task_id in train_data.keys():
            print "task_id:", task_id

        root = param.taxonomy.data

        grey_nodes = [root]

        # top-down processing of taxonomy

        for node in root.get_leaves():

            #####################################################
            #    train predictor
            #####################################################

            parent_node = node.get_nearest_neighbor()

            cost = param.cost

            (examples, labels) = self.get_data(parent_node, train_data)

            # create shogun data objects
            k_parent = shogun_factory_new.create_kernel(examples, param)
            lab_parent = shogun_factory_new.create_labels(labels)

            parent_svm = SVMLight(cost, k_parent, lab_parent)

            parent_svm.train()

            #####################################################
            #    train predictors
            #####################################################

            (examples, labels) = self.get_data(node, train_data)

            # create shogun data objects
            k = shogun_factory_new.create_kernel(examples, param)
            lab = shogun_factory_new.create_labels(labels)

            # regularize vs parent predictor

            weight = param.transform
            print "current edge_weight:", weight, " ,name:", node.name

            svm = DomainAdaptationSVM(cost, k, lab, parent_svm, weight)
            svm.train()

            # attach svm to node
            node.predictor = svm

        #####################################################
        #    Wrap things up
        #####################################################

        # wrap up predictors for later use
        predictors = {}

        for leaf in root.get_leaves():

            predictors[leaf.name] = leaf.predictor

            assert (leaf.predictor != None)

        sym_diff_keys = set(train_data.keys()).symmetric_difference(
            set(predictors.keys()))
        assert len(
            sym_diff_keys
        ) == 0, "symmetric difference between keys non-empty: " + str(
            sym_diff_keys)

        return predictors
from numpy import array, float64
import sys

# create dense matrices A,B,C
A = array([[1, 2, 3], [4, 0, 0], [0, 0, 0], [0, 5, 0], [0, 0, 6], [9, 9, 9]],
          dtype=float64)
B = array([1, 1, 1, -1, -1, -1], dtype=float64)

# ... of type Real, LongInt and Byte
feats_train = RealFeatures(A.transpose())
kernel = GaussianKernel(feats_train, feats_train, 1.0)
kernel.io.set_loglevel(MSG_DEBUG)

lab = Labels(B)

svm = SVMLight(1, kernel, lab)
svm.train()

helper.save("/tmp/awesome_svm", svm)
svm = helper.load("/tmp/awesome_svm")

svm.train()

#sys.exit(0)

run = expenv.Run.get(1010)
#run = expenv.Run.get(974)
dat = run.get_train_data()

print dat.keys()
d = dat["thaliana"]
示例#38
0
def serialization_string_kernels_modular(n_data, num_shifts, size):
    """
    serialize svm with string kernels
    """

    ##################################################
    # set up toy data and svm
    train_xt, train_lt = generate_random_data(n_data)
    test_xt, test_lt = generate_random_data(n_data)

    feats_train = construct_features(train_xt)
    feats_test = construct_features(test_xt)

    max_len = len(train_xt[0])
    kernel_wdk = WeightedDegreePositionStringKernel(size, 5)
    shifts_vector = numpy.ones(max_len, dtype=numpy.int32) * num_shifts
    kernel_wdk.set_shifts(shifts_vector)

    ########
    # set up spectrum
    use_sign = False
    kernel_spec_1 = WeightedCommWordStringKernel(size, use_sign)
    kernel_spec_2 = WeightedCommWordStringKernel(size, use_sign)

    ########
    # combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_wdk)
    kernel.append_kernel(kernel_spec_1)
    kernel.append_kernel(kernel_spec_2)

    # init kernel
    labels = BinaryLabels(train_lt)

    svm = SVMLight(1.0, kernel, labels)
    #svm.io.set_loglevel(MSG_DEBUG)
    svm.train(feats_train)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    #print("unserializing SVM")
    svm2 = load(fn)

    #print("comparing predictions")
    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in xrange(len(out)):
        assert abs(out[i] - out2[i] < 0.000001)

    #print("all checks passed.")

    return out, out2
labels = [i.label for i in d[0:subset_size]]

labels[2] = 1
labels[12] = 1
labels[15] = 1
labels[8] = 1
labels[19] = 1


feat = StringCharFeatures(DNA)
feat.set_features(examples)
wdk = WeightedDegreeStringKernel(feat, feat, 1)
lab = Labels(numpy.array(labels))


svm = SVMLight(1, wdk, lab)
svm.train()

svm.set_shrinking_enabled(False)

print "simple svm", svm.get_objective()

print "len(examples)", len(examples)

print "##############"


#print "##############"
#print "svm light"

#svm_light = SVMLight(1.0,wdk,lab)
def test_data():

    ##################################################################
    # select MSS
    ##################################################################

    mss = expenv.MultiSplitSet.get(379)

    ##################################################################
    # data
    ##################################################################

    # fetch data
    instance_set = mss.get_train_data(-1)

    # prepare data
    data = PreparedMultitaskData(instance_set, shuffle=True)

    # set parameters
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 4
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.freeze()

    ##################################################################
    # taxonomy
    ##################################################################

    taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)

    support = numpy.linspace(0, 100, 4)

    distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]]

    # create tree normalizer
    tree_normalizer = MultitaskKernelPlifNormalizer(support,
                                                    data.task_vector_names)

    task_names = data.get_task_names()

    FACTOR = 1.0

    # init gamma matrix
    gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    for t1_name in task_names:
        for t2_name in task_names:

            similarity = taxonomy.compute_node_similarity(
                taxonomy.get_id(t1_name), taxonomy.get_id(t2_name))
            gammas[data.name_to_id(t1_name),
                   data.name_to_id(t2_name)] = similarity

    helper.save("/tmp/gammas", gammas)

    gammas = gammas * FACTOR

    cost = param.cost * numpy.sqrt(FACTOR)

    print gammas

    ##########
    # regular normalizer

    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    for t1_name in task_names:
        for t2_name in task_names:

            similarity = gammas[data.name_to_id(t1_name),
                                data.name_to_id(t2_name)]
            normalizer.set_task_similarity(data.name_to_id(t1_name),
                                           data.name_to_id(t2_name),
                                           similarity)

    ##################################################################
    # Train SVMs
    ##################################################################

    # create shogun objects
    wdk_tree = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    wdk_tree.set_normalizer(tree_normalizer)
    wdk_tree.init_normalizer()

    print "--->", wdk_tree.get_normalizer().get_name()

    svm_tree = SVMLight(cost, wdk_tree, lab)
    svm_tree.set_linadd_enabled(False)
    svm_tree.set_batch_computation_enabled(False)

    svm_tree.train()

    del wdk_tree
    del tree_normalizer

    print "finished training tree-norm SVM:", svm_tree.get_objective()

    wdk = shogun_factory.create_kernel(data.examples, param)
    wdk.set_normalizer(normalizer)
    wdk.init_normalizer()

    print "--->", wdk.get_normalizer().get_name()

    svm = SVMLight(cost, wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    svm.train()

    print "finished training manually set SVM:", svm.get_objective()

    alphas_tree = svm_tree.get_alphas()
    alphas = svm.get_alphas()

    assert (len(alphas_tree) == len(alphas))

    for i in xrange(len(alphas)):
        assert (abs(alphas_tree[i] - alphas[i]) < 0.0001)

    print "success: all alphas are the same"
def classifier_svmlight_batch_linadd_modular(fm_train_dna, fm_test_dna,
		label_train_dna, degree, C, epsilon, num_threads):

	from shogun.Features import StringCharFeatures, BinaryLabels, DNA
	from shogun.Kernel import WeightedDegreeStringKernel, MSG_DEBUG
	try:
		from shogun.Classifier import SVMLight
	except ImportError:
		print('No support for SVMLight available.')
		return

	feats_train=StringCharFeatures(DNA)
	#feats_train.io.set_loglevel(MSG_DEBUG)
	feats_train.set_features(fm_train_dna)
	feats_test=StringCharFeatures(DNA)
	feats_test.set_features(fm_test_dna)
	degree=20

	kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

	labels=BinaryLabels(label_train_dna)

	svm=SVMLight(C, kernel, labels)
	svm.set_epsilon(epsilon)
	svm.parallel.set_num_threads(num_threads)
	svm.train()

	kernel.init(feats_train, feats_test)

	#print('SVMLight Objective: %f num_sv: %d' % \)
	#	(svm.get_objective(), svm.get_num_support_vectors())
	svm.set_batch_computation_enabled(False)
	svm.set_linadd_enabled(False)
	svm.apply().get_labels()

	svm.set_batch_computation_enabled(True)
	labels = svm.apply().get_labels()
	return labels, svm
def serialization_svmlight_modular(num, dist, width, C):
	from shogun.IO import MSG_DEBUG
	from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
	from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
	from shogun.Classifier import SVMLight
	from numpy import concatenate, ones
	from numpy.random import randn, seed

	import sys
	import types
	import random
	import bz2
	try:
		import cPickle as pickle
	except ImportError:
		import pickle as pickle	
	import inspect


	def save(filename, myobj):
		"""
		save object to file using pickle
		
		@param filename: name of destination file
		@type filename: str
		@param myobj: object to save (has to be pickleable)
		@type myobj: obj
		"""

		try:
			f = bz2.BZ2File(filename, 'wb')
		except IOError as details:
			sys.stderr.write('File ' + filename + ' cannot be written\n')
			sys.stderr.write(details)
			return

		pickle.dump(myobj, f, protocol=2)
		f.close()



	def load(filename):
		"""
		Load from filename using pickle
		
		@param filename: name of file to load from
		@type filename: str
		"""
		
		try:
			f = bz2.BZ2File(filename, 'rb')
		except IOError as details:
			sys.stderr.write('File ' + filename + ' cannot be read\n')
			sys.stderr.write(details)
			return

		myobj = pickle.load(f)
		f.close()
		return myobj


	##################################################

	seed(17)
	traindata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
	testdata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1);

	trainlab=concatenate((-ones(num), ones(num)));
	testlab=concatenate((-ones(num), ones(num)));

	feats_train=RealFeatures(traindata_real);
	feats_test=RealFeatures(testdata_real);
	kernel=GaussianKernel(feats_train, feats_train, width);
	#kernel.io.set_loglevel(MSG_DEBUG)

	labels=BinaryLabels(trainlab);

	svm=SVMLight(C, kernel, labels)
	svm.train()
	#svm.io.set_loglevel(MSG_DEBUG)

	##################################################

	#print("labels:")
	#print(pickle.dumps(labels))
	#
	#print("features")
	#print(pickle.dumps(feats_train))
	#
	#print("kernel")
	#print(pickle.dumps(kernel))
	#
	#print("svm")
	#print(pickle.dumps(svm))
	#
	#print("#################################")

	fn = "serialized_svm.bz2"
	#print("serializing SVM to file", fn)

	save(fn, svm)

	#print("#################################")

	#print("unserializing SVM")
	svm2 = load(fn)


	#print("#################################")
	#print("comparing training")

	svm2.train()

	#print("objective before serialization:", svm.get_objective())
	#print("objective after serialization:", svm2.get_objective())
	return svm, svm.get_objective(), svm2, svm2.get_objective()
def do_batch_linadd ():
	print 'SVMlight batch'

	from shogun.Features import StringCharFeatures, Labels, DNA
	from shogun.Kernel import WeightedDegreeStringKernel
	try:
		from shogun.Classifier import SVMLight
	except ImportError:
		print 'No support for SVMLight available.'
		return

	feats_train=StringCharFeatures(DNA)
	feats_train.set_features(fm_train_dna)
	feats_test=StringCharFeatures(DNA)
	feats_test.set_features(fm_test_dna)
	degree=20

	kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

	C=1
	epsilon=1e-5
	num_threads=2
	labels=Labels(label_train_dna)

	svm=SVMLight(C, kernel, labels)
	svm.set_epsilon(epsilon)
	svm.parallel.set_num_threads(num_threads)
	svm.train()

	kernel.init(feats_train, feats_test)

	#print 'SVMLight Objective: %f num_sv: %d' % \
	#	(svm.get_objective(), svm.get_num_support_vectors())
	svm.set_batch_computation_enabled(False)
	svm.set_linadd_enabled(False)
	svm.classify().get_labels()

	svm.set_batch_computation_enabled(True)
	svm.classify().get_labels()

print 'SVMLight'

from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
from shogun.Classifier import SVMLight

feats_train=StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)

kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

C=10
epsilon=1e-5
num_threads=1
labels=Labels(label_train_dna)

svm=SVMLight(C, kernel, labels)
svm.set_qpsize(3)
svm.set_linear_term(-numpy.array([1,2,3,4,5,6,7,8,7,6], dtype=numpy.double));
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()

kernel.init(feats_train, feats_test)
out = svm.classify().get_labels()

width=2.1

traindata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
testdata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1);

trainlab=concatenate((-ones(num), ones(num)));
testlab=concatenate((-ones(num), ones(num)));

feats_train=RealFeatures(traindata_real);
feats_test=RealFeatures(testdata_real);
kernel=GaussianKernel(feats_train, feats_train, width);
kernel.io.set_loglevel(MSG_DEBUG)

labels=Labels(trainlab);

svm=SVMLight(2, kernel, labels)
svm.train()
svm.io.set_loglevel(MSG_DEBUG)

##################################################

print "labels:"
print labels.to_string()

print "features"
print feats_train.to_string()

print "kernel"
print kernel.to_string()

print "svm"
def test_data():
    
    ##################################################################
    # select MSS
    ##################################################################
    
    mss = expenv.MultiSplitSet.get(379)
    
    
    
    ##################################################################
    # data
    ##################################################################
    
    # fetch data
    instance_set = mss.get_train_data(-1)
    
    # prepare data
    data = PreparedMultitaskData(instance_set, shuffle=True)
    
    # set parameters
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 4
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.freeze()
    
    
    
    
    ##################################################################
    # taxonomy
    ##################################################################
    
    
    taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)
    
    
    support = numpy.linspace(0, 100, 4)
    
    
    distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]]
    
    # create tree normalizer 
    tree_normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_names)
    
    
    
    
    task_names = data.get_task_names()
    
    
    FACTOR = 1.0
    
    
    # init gamma matrix
    gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))
    
    for t1_name in task_names:
        for t2_name in task_names:
            
            similarity = taxonomy.compute_node_similarity(taxonomy.get_id(t1_name), taxonomy.get_id(t2_name))        
            gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity
    
    helper.save("/tmp/gammas", gammas)
    
    
    gammas = gammas * FACTOR
    
    cost = param.cost * numpy.sqrt(FACTOR) 
    
    print gammas
    
    
    ##########
    # regular normalizer
    
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)
    
    for t1_name in task_names:
        for t2_name in task_names:
                    
            similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)]
            normalizer.set_task_similarity(data.name_to_id(t1_name), data.name_to_id(t2_name), similarity)
    
                
    ##################################################################
    # Train SVMs
    ##################################################################
    
    # create shogun objects
    wdk_tree = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)
    
    wdk_tree.set_normalizer(tree_normalizer)
    wdk_tree.init_normalizer()
    
    print "--->",wdk_tree.get_normalizer().get_name()
    
    svm_tree = SVMLight(cost, wdk_tree, lab)
    svm_tree.set_linadd_enabled(False)
    svm_tree.set_batch_computation_enabled(False)
    
    svm_tree.train()
    
    del wdk_tree
    del tree_normalizer
    
    print "finished training tree-norm SVM:", svm_tree.get_objective()
    
    
    wdk = shogun_factory.create_kernel(data.examples, param)
    wdk.set_normalizer(normalizer)
    wdk.init_normalizer()
    
    print "--->",wdk.get_normalizer().get_name()
    
    svm = SVMLight(cost, wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)
    
    svm.train()
    
    print "finished training manually set SVM:", svm.get_objective()
    
    
    alphas_tree = svm_tree.get_alphas()
    alphas = svm.get_alphas()
    
    assert(len(alphas_tree)==len(alphas))
    
    for i in xrange(len(alphas)):
        assert(abs(alphas_tree[i] - alphas[i]) < 0.0001)
        
    print "success: all alphas are the same"
示例#47
0
examples_presvm = [i.example for i in d[0:subset_size]]
labels_presvm = [i.label for i in d[0:subset_size]]

labels_presvm[2] = 1
labels_presvm[12] = 1
labels_presvm[15] = 1
labels_presvm[8] = 1
labels_presvm[19] = 1

feat_presvm = StringCharFeatures(DNA)
feat_presvm.set_features(examples_presvm)
wdk_presvm = WeightedDegreeStringKernel(feat_presvm, feat_presvm, 1)
lab_presvm = Labels(numpy.array(labels_presvm))

presvm = SVMLight(1, wdk_presvm, lab_presvm)
presvm.train()

presvm2 = LibSVM(1, wdk_presvm, lab_presvm)
presvm2.train()

print "svmlight", presvm.get_objective()
print "libsvm", presvm2.get_objective()

assert (abs(presvm.get_objective() - presvm2.get_objective()) <= 0.001)

print "simple svm", presvm.get_objective()

print "len(examples_presvm)", len(examples_presvm)

print "##############"
示例#48
0
def classifier_svmlight_batch_linadd_modular(fm_train_dna, fm_test_dna,
                                             label_train_dna, degree, C,
                                             epsilon, num_threads):

    from shogun.Features import StringCharFeatures, Labels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel, MSG_DEBUG
    try:
        from shogun.Classifier import SVMLight
    except ImportError:
        print 'No support for SVMLight available.'
        return

    feats_train = StringCharFeatures(DNA)
    #feats_train.io.set_loglevel(MSG_DEBUG)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)
    degree = 20

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = Labels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)

    #print 'SVMLight Objective: %f num_sv: %d' % \
    #	(svm.get_objective(), svm.get_num_support_vectors())
    svm.set_batch_computation_enabled(False)
    svm.set_linadd_enabled(False)
    svm.apply().get_labels()

    svm.set_batch_computation_enabled(True)
    labels = svm.apply().get_labels()
    return labels, svm
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """

        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(details)
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """

        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(details)
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################
    # set up toy data and svm

    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                 axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist),
                                axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    print("unserializing SVM")
    svm2 = load(fn)

    print("comparing objectives")

    svm2.train()

    print("objective before serialization:", svm.get_objective())
    print("objective after serialization:", svm2.get_objective())

    print("comparing predictions")

    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in xrange(len(out)):
        assert abs(out[i] - out2[i] < 0.000001)

    print("all checks passed.")

    return True
            
##################################################################
# Train SVMs
##################################################################

# create shogun objects
wdk_tree = shogun_factory.create_kernel(data.examples, param)
lab = shogun_factory.create_labels(data.labels)

wdk_tree.set_normalizer(tree_normalizer)
wdk_tree.init_normalizer()

print "--->",wdk_tree.get_normalizer().get_name()

svm_tree = SVMLight(cost, wdk_tree, lab)
svm_tree.set_linadd_enabled(False)
svm_tree.set_batch_computation_enabled(False)

svm_tree.train()

del wdk_tree
del tree_normalizer

print "finished training tree-norm SVM:", svm_tree.get_objective()


wdk = shogun_factory.create_kernel(data.examples, param)
wdk.set_normalizer(normalizer)
wdk.init_normalizer()
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        lab = shogun_factory.create_labels(data.labels)

        # support
        support = numpy.linspace(0, 1, 5)

        # set normalizer
        normalizer = MultitaskKernelPlifNormalizer(support,
                                                   data.task_vector_nums)

        # fetch taxonomy from parameter object
        taxonomy = param.taxonomy.data

        taxonomy.plot()
        import os
        os.system("evince demo.png &")

        # compute distances
        distances = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

        for (i, task_name_lhs) in enumerate(data.get_task_names()):
            for (j, task_name_rhs) in enumerate(data.get_task_names()):

                distances[i, j] = task_similarities.compute_hop_distance(
                    taxonomy, task_name_lhs, task_name_rhs)

        # normalize distances
        distances = distances / numpy.max(distances)

        # set distances
        for (i, task_name_lhs) in enumerate(data.get_task_names()):
            for (j, task_name_rhs) in enumerate(data.get_task_names()):

                normalizer.set_task_distance(i, j, distances[i, j])

        # assign normalizer
        base_wdk.set_normalizer(normalizer)
        base_wdk.init_normalizer()

        svm = None

        debug_weights = {}

        num_subk = base_wdk.get_num_subkernels()

        print "num subkernels:", num_subk

        #print "subkernel weights:", base_wdk.get_subkernel_weights()

        debug_weights["before"] = [
            normalizer.get_beta(i) for i in range(num_subk)
        ]

        print "using MKL:", (param.transform >= 1.0)

        if param.transform >= 1.0:

            num_threads = 4

            svm = MKLClassification()

            svm.set_mkl_norm(param.transform)
            #svm.set_solver_type(ST_CPLEX) #GLPK) #DIRECT) #NEWTON)#ST_CPLEX)

            svm.set_C(param.cost, param.cost)

            svm.set_kernel(base_wdk)
            svm.set_labels(lab)

            svm.parallel.set_num_threads(num_threads)
            svm.set_linadd_enabled(False)
            svm.set_batch_computation_enabled(False)

            svm.train()

            #print "subkernel weights (after):", base_wdk.get_subkernel_weights()

        else:

            # create SVM (disable unsupported optimizations)
            svm = SVMLight(param.cost, base_wdk, lab)
            svm.set_linadd_enabled(False)
            svm.set_batch_computation_enabled(False)

            svm.train()

        print "svm objective:", svm.get_objective()

        debug_weights["after"] = [
            normalizer.get_beta(i) for i in range(num_subk)
        ]

        # debugging output
        print "debug weights (before/after):"
        print debug_weights["before"]
        print debug_weights["after"]
        print ""

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (svm, data.name_to_id(task_name))

        return svms
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2

    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """

        try:
            f = bz2.BZ2File(filename, "wb")
        except IOError as details:
            sys.stderr.write("File " + filename + " cannot be written\n")
            sys.stderr.write(details)
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """

        try:
            f = bz2.BZ2File(filename, "rb")
        except IOError as details:
            sys.stderr.write("File " + filename + " cannot be read\n")
            sys.stderr.write(details)
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################
    # set up toy data and svm

    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    # kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    # svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    # print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    # print("unserializing SVM")
    svm2 = load(fn)

    # print("comparing objectives")

    svm2.train()

    # print("objective before serialization:", svm.get_objective())
    # print("objective after serialization:", svm2.get_objective())

    # print("comparing predictions")

    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in xrange(len(out)):
        assert abs(out[i] - out2[i] < 0.000001)

    # print("all checks passed.")

    return True
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # dict to save additional information for later analysis
        self.additional_information = {}

        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=True)

        # create shogun label
        lab = shogun_factory.create_labels(data.labels)

        ########################################################
        print "creating a kernel for each node:"
        ########################################################

        # assemble combined kernel

        combined_kernel = CombinedKernel()

        combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

        base_features = shogun_factory.create_features(data.examples, param)

        combined_features = CombinedFeatures()

        ##################################################
        # intra-domain blocks (dirac kernel)

        intra_block_vec = PairiiVec()

        for task_id in data.get_task_ids():
            intra_block_vec.push_back(Pairii(task_id, task_id))

        # create mask-based normalizer
        normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums,
                                                       intra_block_vec)
        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)

        # append features
        combined_features.append_feature_obj(base_features)

        print "------"

        ##################################################
        # all blocks (full kernel matrix)

        all_block_vec = PairiiVec()

        for task_id_1 in data.get_task_ids():
            for task_id_2 in data.get_task_ids():
                all_block_vec.push_back(Pairii(task_id_1, task_id_2))

        # create mask-based normalizer
        normalizer_all = MultitaskKernelMaskPairNormalizer(
            data.task_vector_nums, all_block_vec)
        kernel_all = shogun_factory.create_empty_kernel(param)
        kernel_all.set_normalizer(normalizer_all)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel_all)

        # append features
        combined_features.append_feature_obj(base_features)

        ##################################################
        # hack

        #        hack_block_vec = PairiiVec()
        #
        #        for task_id_1 in data.get_task_ids():
        #            for task_id_2 in data.get_task_ids():
        #                hack_block_vec.push_back(Pairii(task_id_1, task_id_2))
        #
        #        hack_block_vec.push_back(Pairii(data.name_to_id("B_2705"), data.name_to_id("B_4001")))
        #        other_group = ["B_0702", "B_1501", "B_5801"]
        #        for task_id_1 in other_group:
        #            for task_id_2 in other_group:
        #                hack_block_vec.push_back(Pairii(data.name_to_id(task_id_1), data.name_to_id(task_id_2)))
        #
        #
        #
        #        # create mask-based normalizer
        #        normalizer_hack = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, hack_block_vec)
        #        kernel_hack = shogun_factory.create_empty_kernel(param)
        #        kernel_hack.set_normalizer(normalizer_hack)
        #
        #        # append current kernel to CombinedKernel
        #        combined_kernel.append_kernel(kernel_hack)
        #
        #        # append features
        #        combined_features.append_feature_obj(base_features)

        ##################################################
        # init combined kernel

        combined_kernel.init(combined_features, combined_features)

        #combined_kernel.precompute_subkernels()
        self.additional_information[
            "mkl weights before"] = combined_kernel.get_subkernel_weights()

        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None

        print "using MKL:", (param.flags["mkl_q"] >= 1.0)

        if param.flags["mkl_q"] >= 1.0:

            svm = MKLClassification()

            svm.set_mkl_norm(param.flags["mkl_q"])
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)

        else:

            # create SVM (disable unsupported optimizations)
            combined_kernel.set_cache_size(500)
            svm = SVMLight(param.cost, combined_kernel, lab)

        num_threads = 8
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        svm.set_epsilon(0.03)

        # set cost
        if param.flags["normalize_cost"]:

            norm_c_pos = param.cost / float(
                len([l for l in data.labels if l == 1]))
            norm_c_neg = param.cost / float(
                len([l for l in data.labels if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)

        else:

            svm.set_C(param.cost, param.cost)

        svm.train()

        print "subkernel weights (after):", combined_kernel.get_subkernel_weights(
        )

        ########################################################
        print "svm objective:"
        print svm.get_objective()

        self.additional_information["svm_objective"] = svm.get_objective()
        self.additional_information[
            "svm num sv"] = svm.get_num_support_vectors()
        self.additional_information[
            "mkl weights post-training"] = combined_kernel.get_subkernel_weights(
            )

        ########################################################

        # wrap up predictors
        svms = {}

        # use a reference to the same svm several times
        for task_name in train_data.keys():
            svms[task_name] = (data.name_to_id(task_name), combined_kernel,
                               svm, param)

        return svms
示例#54
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
                
        # merge data sets
        data = PreparedMultitaskData(train_data, shuffle=False)

                
        # create shogun data objects
        base_wdk = shogun_factory.create_kernel(data.examples, param)
        kernel_matrix = base_wdk.get_kernel_matrix()
        lab = shogun_factory.create_labels(data.labels)
        

        # fetch taxonomy from parameter object
        taxonomy = param.taxonomy.data

        # create name to leaf map
        nodes = taxonomy.get_all_nodes()


        ########################################################
        print "creating a kernel for each node:"
        ########################################################


        # assemble combined kernel
        from shogun.Kernel import CombinedKernel, CustomKernel
        
        combined_kernel = CombinedKernel()
        
        # indicator to which task each example belongs
        task_vector = data.task_vector_names
        
        for node in nodes:
            
            print "creating kernel for ", node.name
            
            # fetch sub-tree
            leaf_names = [leaf.name for leaf in node.get_leaves()]
            
            print "masking all entries other than:", leaf_names
            
            # init matrix
            kernel_matrix_node = numpy.zeros(kernel_matrix.shape)
            
            # fill matrix for node
            for (i, task_lhs) in enumerate(task_vector):
                for (j, task_rhs) in enumerate(task_vector):
                    
                    # only copy values, if both tasks are present in subtree
                    if task_lhs in leaf_names and task_rhs in leaf_names:
                        kernel_matrix_node[i,j] = kernel_matrix[i,j]
                    
            # create custom kernel
            kernel_node = CustomKernel()
            kernel_node.set_full_kernel_matrix_from_full(kernel_matrix_node)
            
            
            # append custom kernel to CombinedKernel
            combined_kernel.append_kernel(kernel_node)                
            
            print "------"
        

        print "subkernel weights:", combined_kernel.get_subkernel_weights()

        svm = None
                
        
        print "using MKL:", (param.transform >= 1.0)
        
        if param.transform >= 1.0:
        
        
            num_threads = 4

            
            svm = MKLClassification()
            
            svm.set_mkl_norm(param.transform)
            svm.set_solver_type(ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX)
        
            svm.set_C(param.cost, param.cost)
            
            svm.set_kernel(combined_kernel)
            svm.set_labels(lab)
            
            svm.parallel.set_num_threads(num_threads)
            #svm.set_linadd_enabled(False)
            #svm.set_batch_computation_enabled(False)
            
            svm.train()
        
            print "subkernel weights (after):", combined_kernel.get_subkernel_weights()    
            
        else:
            
            # create SVM (disable unsupported optimizations)
            svm = SVMLight(param.cost, combined_kernel, lab)
            svm.set_linadd_enabled(False)
            svm.set_batch_computation_enabled(False)
            
            svm.train()


        ########################################################
        print "svm objective:"
        print svm.get_objective()
        ########################################################
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_id in train_data.keys():
            svms[task_id] = svm


        return svms
示例#55
0
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """



        for task_id in train_data.keys():
            print "task_id:", task_id
            
        
        root = param.taxonomy.data
        

        grey_nodes = [root]
        

        # top-down processing of taxonomy
        
        for node in root.get_leaves():


            #####################################################
            #    train predictor 
            #####################################################
 
            parent_node = node.get_nearest_neighbor()

            cost = param.cost
   
            (examples, labels) = self.get_data(parent_node, train_data)

            # create shogun data objects
            k_parent = shogun_factory_new.create_kernel(examples, param)
            lab_parent = shogun_factory_new.create_labels(labels)

            parent_svm = SVMLight(cost, k_parent, lab_parent)
            
            parent_svm.train()
    


            #####################################################
            #    train predictors    
            #####################################################
            

            (examples, labels) = self.get_data(node, train_data)

            # create shogun data objects
            k = shogun_factory_new.create_kernel(examples, param)
            lab = shogun_factory_new.create_labels(labels)

               
            # regularize vs parent predictor
            
            weight = param.transform
            print "current edge_weight:", weight, " ,name:", node.name
            
            svm = DomainAdaptationSVM(cost, k, lab, parent_svm, weight)
            svm.train()

                                    
            # attach svm to node
            node.predictor = svm
 


        #####################################################
        #    Wrap things up    
        #####################################################
 
        # wrap up predictors for later use
        predictors = {}

        for leaf in root.get_leaves():
            
            predictors[leaf.name] = leaf.predictor
            
            assert(leaf.predictor!=None)
            
        sym_diff_keys = set(train_data.keys()).symmetric_difference(set(predictors.keys()))
        assert len(sym_diff_keys)==0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)  


        return predictors