示例#1
0
def distribution_hmm(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
    """Train a shogun ``HMM`` on CUBE-alphabet strings and return its likelihoods.

    ``fm_cube`` is loaded into char features, converted to word features with
    ``obtain_from_char`` and used to build ``HMM(feats, N, M, pseudo)``, which
    is trained with ``train()`` followed by
    ``baum_welch_viterbi_train(BW_NORMAL)``.  Every log-derivative, best path
    and best-path state is then evaluated once; those values are not returned.

    Note: ``num_examples`` is reassigned from ``feats.get_num_vectors()``
    before it is ever read, so the value passed by the caller has no effect.

    Returns:
        Tuple ``(lik_example, lik_sample, hmm)`` holding the results of
        ``get_log_likelihood()``, ``get_log_likelihood_sample()`` and the
        HMM object itself.
    """
    from shogun import StringWordFeatures, StringCharFeatures, CUBE
    from shogun import HMM, BW_NORMAL

    cube_chars = StringCharFeatures(CUBE)
    cube_chars.set_features(fm_cube)
    cube_words = StringWordFeatures(cube_chars.get_alphabet())
    cube_words.obtain_from_char(cube_chars, order - 1, order, gap, reverse)

    model = HMM(cube_words, N, M, pseudo)
    model.train()
    model.baum_welch_viterbi_train(BW_NORMAL)

    num_examples = cube_words.get_num_vectors()
    param_count = model.get_num_model_parameters()

    # Evaluate every derivative once; results are discarded.
    for example_idx in range(num_examples):
        for param_idx in range(param_count):
            model.get_log_derivative(param_idx, example_idx)

    # Accumulate best-path values; the totals are not part of the result.
    path_total = 0
    state_total = 0
    for example_idx in range(num_examples):
        path_total += model.best_path(example_idx)
        for state_idx in range(N):
            state_total += model.get_best_path_state(example_idx, state_idx)

    log_lik = model.get_log_likelihood()
    log_lik_sample = model.get_log_likelihood_sample()
    return log_lik, log_lik_sample, model
def kernel_histogram_word_string(fm_train_dna=traindat,
                                 fm_test_dna=testdat,
                                 label_train_dna=label_traindat,
                                 order=3,
                                 ppseudo_count=1,
                                 npseudo_count=1):
    """Compute train and test matrices of a ``HistogramWordStringKernel``.

    DNA strings are turned into word features, a ``PluginEstimate`` is
    trained on the training words and labels, and the kernel is evaluated
    first train-vs-train and then train-vs-test.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
    from shogun import PluginEstimate

    def _dna_words(strings):
        # Char features -> word features with gap 0 and no reversal.
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, 0, False)
        return words

    words_train = _dna_words(fm_train_dna)
    words_test = _dna_words(fm_test_dna)

    estimate = PluginEstimate(ppseudo_count, npseudo_count)
    estimate.set_labels(BinaryLabels(label_train_dna))
    estimate.set_features(words_train)
    estimate.train()

    hist_kernel = HistogramWordStringKernel(words_train, words_train, estimate)
    km_train = hist_kernel.get_kernel_matrix()

    hist_kernel.init(words_train, words_test)
    estimate.set_features(words_test)
    # The predicted labels are computed but not used.
    estimate.apply().get_labels()
    km_test = hist_kernel.get_kernel_matrix()

    return km_train, km_test, hist_kernel
示例#3
0
def distribution_linearhmm(fm_dna=traindna, order=3, gap=0, reverse=False):
    """Fit a shogun ``LinearHMM`` to DNA strings and report its log-likelihoods.

    The strings are converted to word features via ``obtain_from_char`` and
    the model is trained on them.  Transition probabilities and all
    log-derivatives are queried once; their values are not returned.

    Returns:
        Tuple ``(hmm, out_likelihood, out_sample)``.
    """
    from shogun import StringWordFeatures, StringCharFeatures, DNA
    from shogun import LinearHMM

    dna_chars = StringCharFeatures(DNA)
    dna_chars.set_features(fm_dna)
    dna_words = StringWordFeatures(dna_chars.get_alphabet())
    dna_words.obtain_from_char(dna_chars, order - 1, order, gap, reverse)

    model = LinearHMM(dna_words)
    model.train()

    # Return value is not used.
    model.get_transition_probs()

    vector_count = dna_words.get_num_vectors()
    param_count = model.get_num_model_parameters()
    for vec_idx in range(vector_count):
        for param_idx in range(param_count):
            model.get_log_derivative(param_idx, vec_idx)

    return model, model.get_log_likelihood(), model.get_log_likelihood_sample()
示例#4
0
def classifier_ssk(fm_train_dna=traindat,
                   fm_test_dna=testdat,
                   label_train_dna=label_traindat,
                   C=1,
                   maxlen=1,
                   decay=1):
    """Train a ``LibSVM`` with a ``SubsequenceStringKernel`` and predict test labels.

    The training error is evaluated with ``ErrorRateMeasure`` but is not
    returned.

    Returns:
        The labels obtained from ``svm.apply(feats_test).get_labels()``.
    """
    from shogun import StringCharFeatures, BinaryLabels
    from shogun import LibSVM, SubsequenceStringKernel, DNA
    from shogun import ErrorRateMeasure

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)
    train_labels = BinaryLabels(label_train_dna)
    ssk = SubsequenceStringKernel(train_feats, train_feats, maxlen, decay)

    machine = LibSVM(C, ssk, train_labels)
    machine.train()

    # Training error is computed and then dropped.
    train_output = machine.apply(train_feats)
    ErrorRateMeasure().evaluate(train_output, train_labels)

    ssk.init(train_feats, test_feats)
    return machine.apply(test_feats).get_labels()
示例#5
0
	def get_predictions_from_seqdict(self, seqdic, site):
		"""Score the candidate positions of every sequence in ``seqdic``.

		For each entry one test feature object is built that contains a
		window for every position listed in ``s.preds[site].positions``;
		building all windows of a sequence at once keeps computing the
		outputs fast and low on memory.  The SVM outputs are stored back
		through ``s.preds[site].set_scores``.

		Args:
			seqdic: iterable of objects exposing ``seq`` and ``preds``.
			site: key into each entry's ``preds`` mapping.

		Returns:
			None; the entries of ``seqdic`` are updated in place.
		"""

		seqlen = self.window_right + self.window_left + 2

		for s in seqdic:
			positions = s.preds[site].positions

			# Window start for each position.
			position_list = DynamicIntArray()
			for pos in positions:
				position_list.append_element(pos - self.offset - self.window_left)

			t = StringCharFeatures([s.seq], DNA)
			t.obtain_by_position_list(seqlen, position_list)
			self.wd_kernel.init(self.traindat, t)

			self.wd_kernel.io.enable_progress()
			outputs = self.svm.apply().get_values()
			self.wd_kernel.cleanup()
			sys.stdout.write("\n...done...\n")

			# One score per position, copied into a plain list.
			scores = [outputs[j] for j in range(len(positions))]
			s.preds[site].set_scores(scores)
示例#6
0
def kernel_combined(fm_train_real=traindat, fm_test_real=testdat, fm_train_dna=traindna, fm_test_dna=testdna):
    """Evaluate a ``CombinedKernel`` made of three sub-kernels.

    The sub-kernels, appended in this order, are a ``GaussianKernel`` on the
    real-valued data and a ``FixedDegreeStringKernel`` and a
    ``LocalAlignmentStringKernel`` on the DNA strings.  Each sub-kernel gets
    its own feature object appended to the combined train/test features.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from shogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    combined = CombinedKernel()
    combined_train = CombinedFeatures()
    combined_test = CombinedFeatures()

    def _attach(train_part, test_part, sub_kernel):
        # Register one (train features, test features, kernel) triple.
        combined_train.append_feature_obj(train_part)
        combined_test.append_feature_obj(test_part)
        combined.append_kernel(sub_kernel)

    _attach(RealFeatures(fm_train_real),
            RealFeatures(fm_test_real),
            GaussianKernel(10, 1.1))

    degree = 3
    _attach(StringCharFeatures(fm_train_dna, DNA),
            StringCharFeatures(fm_test_dna, DNA),
            FixedDegreeStringKernel(10, degree))

    _attach(StringCharFeatures(fm_train_dna, DNA),
            StringCharFeatures(fm_test_dna, DNA),
            LocalAlignmentStringKernel(10))

    combined.init(combined_train, combined_train)
    km_train = combined.get_kernel_matrix()
    combined.init(combined_train, combined_test)
    km_test = combined.get_kernel_matrix()
    return km_train, km_test, combined
def distribution_linearhmm(fm_dna=traindna, order=3, gap=0, reverse=False):
    """Train a ``LinearHMM`` on word features derived from DNA strings.

    After training, the transition probabilities and every log-derivative
    are requested once (results unused) before the likelihoods are read.

    Returns:
        Tuple ``(hmm, out_likelihood, out_sample)``.
    """
    from shogun import StringWordFeatures, StringCharFeatures, DNA
    from shogun import LinearHMM

    chars = StringCharFeatures(DNA)
    chars.set_features(fm_dna)
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, order - 1, order, gap, reverse)

    linear_hmm = LinearHMM(words)
    linear_hmm.train()
    linear_hmm.get_transition_probs()

    n_examples = words.get_num_vectors()
    n_params = linear_hmm.get_num_model_parameters()
    for example in range(n_examples):
        for param in range(n_params):
            linear_hmm.get_log_derivative(param, example)

    likelihood = linear_hmm.get_log_likelihood()
    sample_likelihood = linear_hmm.get_log_likelihood_sample()
    return linear_hmm, likelihood, sample_likelihood
def kernel_salzberg_word_string(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat,
                                order=3, gap=0, reverse=False):
    """Compute train and test matrices of a ``SalzbergWordStringKernel``.

    A default-constructed ``PluginEstimate`` is trained on the training
    word features and labels and handed to the kernel together with the
    labels.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import SalzbergWordStringKernel
    from shogun import PluginEstimate

    def _as_words(strings):
        # DNA strings -> char features -> word features.
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _as_words(fm_train_dna)
    test_words = _as_words(fm_test_dna)

    estimator = PluginEstimate()
    train_labels = BinaryLabels(label_train_dna)
    estimator.set_labels(train_labels)
    estimator.set_features(train_words)
    estimator.train()

    salzberg = SalzbergWordStringKernel(train_words, train_words, estimator, train_labels)
    km_train = salzberg.get_kernel_matrix()

    salzberg.init(train_words, test_words)
    estimator.set_features(test_words)
    # Predictions are produced but not used.
    estimator.apply().get_labels()
    km_test = salzberg.get_kernel_matrix()
    return km_train, km_test, salzberg
示例#9
0
def kernel_comm_word_string(fm_train_dna=traindat,
                            fm_test_dna=testdat,
                            order=3,
                            gap=0,
                            reverse=False,
                            use_sign=False):
    """Compute train and test matrices of a ``CommWordStringKernel``.

    Both string sets are converted to word features and run through one
    shared ``SortWordString`` preprocessor, which is initialised on the
    training features only.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import CommWordStringKernel
    from shogun import StringWordFeatures, StringCharFeatures, DNA
    from shogun import SortWordString

    def _word_features(strings):
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preprocessor(sorter)
    train_words.apply_preprocessor()

    test_words = _word_features(fm_test_dna)
    test_words.add_preprocessor(sorter)
    test_words.apply_preprocessor()

    comm_kernel = CommWordStringKernel(train_words, train_words, use_sign)
    km_train = comm_kernel.get_kernel_matrix()

    comm_kernel.init(train_words, test_words)
    km_test = comm_kernel.get_kernel_matrix()
    return km_train, km_test, comm_kernel
示例#10
0
def kernel_weighted_degree_string(fm_train_dna=traindat, fm_test_dna=testdat, degree=20):
    """Compute train and test matrices of a ``WeightedDegreeStringKernel``.

    The kernel's degree weights are set explicitly to the sequence
    ``degree, degree-1, ..., 1`` divided by ``1 + 2 + ... + degree``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, DNA
    from shogun import WeightedDegreeStringKernel, MSG_DEBUG
    from numpy import arange, double

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    wd_kernel = WeightedDegreeStringKernel(train_feats, train_feats, degree)

    # Descending ramp normalised by its own total.
    descending = arange(1, degree + 1, dtype=double)[::-1]
    normaliser = sum(arange(1, degree + 1, dtype=double))
    wd_kernel.set_wd_weights(descending / normaliser)

    km_train = wd_kernel.get_kernel_matrix()
    wd_kernel.init(train_feats, test_feats)
    km_test = wd_kernel.get_kernel_matrix()

    return km_train, km_test, wd_kernel
def kernel_weighted_comm_word_string(fm_train_dna=traindat, fm_test_dna=testdat, order=3, gap=0, reverse=True):
    """Compute train and test matrices of a ``WeightedCommWordStringKernel``.

    Word features for both sets share one ``SortWordString`` preprocessor
    initialised on the training features.  The kernel is always built with
    ``use_sign=False``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import WeightedCommWordStringKernel
    from shogun import StringWordFeatures, StringCharFeatures, DNA
    from shogun import SortWordString

    train_chars = StringCharFeatures(fm_train_dna, DNA)
    train_words = StringWordFeatures(train_chars.get_alphabet())
    train_words.obtain_from_char(train_chars, order - 1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preprocessor(sorter)
    train_words.apply_preprocessor()

    test_chars = StringCharFeatures(fm_test_dna, DNA)
    test_words = StringWordFeatures(test_chars.get_alphabet())
    test_words.obtain_from_char(test_chars, order - 1, order, gap, reverse)
    test_words.add_preprocessor(sorter)
    test_words.apply_preprocessor()

    weighted_kernel = WeightedCommWordStringKernel(train_words, train_words, False)
    km_train = weighted_kernel.get_kernel_matrix()

    weighted_kernel.init(train_words, test_words)
    km_test = weighted_kernel.get_kernel_matrix()
    return km_train, km_test, weighted_kernel
示例#12
0
def classifier_domainadaptationsvm (fm_train_dna=traindna,fm_test_dna=testdna, \
                                                label_train_dna=label_traindna, \
                                               label_test_dna=label_testdna,fm_train_dna2=traindna2,fm_test_dna2=testdna2, \
                                               label_train_dna2=label_traindna2,label_test_dna2=label_testdna2,C=1,degree=3):
    """Train an SVM on one domain, then a ``DomainAdaptationSVM`` on a second.

    An ``SVMLight`` with a ``WeightedDegreeStringKernel`` is trained on the
    first domain (``fm_train_dna`` / ``label_train_dna``).  A
    ``DomainAdaptationSVM`` is then trained on the second domain
    (``fm_train_dna2`` / ``label_train_dna2``) with the first SVM passed in
    as the previously obtained solution, and applied to ``fm_test_dna2``.

    Previously the second-domain arguments were accepted but never read:
    the second feature, kernel and label objects were rebuilt from the
    first-domain inputs.

    ``fm_test_dna``, ``label_test_dna`` and ``label_test_dna2`` are accepted
    but not used by this function.

    Returns:
        The result of ``dasvm.apply_binary`` on the second-domain test
        features.
    """
    # First domain: plain SVM.
    feats_train = StringCharFeatures(fm_train_dna, DNA)
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    labels = BinaryLabels(label_train_dna)
    svm = SVMLight(C, kernel, labels)
    svm.train()

    # Second domain: features, kernel and labels come from the *2 inputs.
    feats_train2 = StringCharFeatures(fm_train_dna2, DNA)
    feats_test2 = StringCharFeatures(fm_test_dna2, DNA)
    kernel2 = WeightedDegreeStringKernel(feats_train2, feats_train2, degree)
    labels2 = BinaryLabels(label_train_dna2)

    # we regularize against the previously obtained solution
    dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
    dasvm.train()

    out = dasvm.apply_binary(feats_test2)

    return out  #,dasvm TODO
示例#13
0
    def init_sensor(self, kernel, svs):
        """Build the kernel and training features described by ``kernel``.

        Args:
            kernel: mapping with at least ``'name'`` and ``'order'``; the
                ``'wdshift'`` kernel additionally reads ``'shift'``.
            svs: strings used to build DNA ``StringCharFeatures``.

        Returns:
            Tuple ``(self.kernel, self.train_features)``.  For the
            ``'spectrum'`` kernel ``self.preproc`` is also set to the
            ``SortWordString`` preprocessor that was applied.

        Raises:
            ValueError: if ``kernel['name']`` is neither ``'spectrum'`` nor
                ``'wdshift'``.  (The original raised a bare string, which is
                not a valid exception.)
        """
        kname = kernel['name']
        # Reject unsupported kernels before any features are built.
        if kname not in ('spectrum', 'wdshift'):
            raise ValueError("Currently, only wdshift and spectrum kernels supported")

        f = StringCharFeatures(svs, DNA)

        if kname == 'spectrum':
            order = kernel['order']
            wf = StringWordFeatures(f.get_alphabet())
            wf.obtain_from_char(f, order - 1, order, 0, False)

            pre = SortWordString()
            pre.init(wf)
            wf.add_preprocessor(pre)
            wf.apply_preprocessor()
            f = wf

            k = CommWordStringKernel(0, False)
            k.set_use_dict_diagonal_optimization(order < 8)
            self.preproc = pre
        else:
            # 'wdshift': the same shift and a uniform weight at every position.
            max_len = f.get_max_vector_length()
            k = WeightedDegreePositionStringKernel(0, kernel['order'])
            k.set_normalizer(IdentityKernelNormalizer())
            k.set_shifts(kernel['shift'] *
                         numpy.ones(max_len, dtype=numpy.int32))
            k.set_position_weights(1.0 / max_len *
                                   numpy.ones(max_len, dtype=numpy.float64))

        self.kernel = k
        self.train_features = f

        return (self.kernel, self.train_features)
示例#14
0
def distance_hammingword(fm_train_dna=traindna, fm_test_dna=testdna,
                         fm_test_real=testdat, order=3, gap=0, reverse=False, use_sign=False):
    """Compute train and test matrices of a ``HammingWordDistance``.

    Both string sets become sorted word features (one ``SortWordString``
    preprocessor, initialised on the training features).  ``fm_test_real``
    is accepted but not read.

    Returns:
        Tuple ``(distance, dm_train, dm_test)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString
    from shogun import HammingWordDistance

    def _word_features(strings):
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preprocessor(sorter)
    train_words.apply_preprocessor()

    test_words = _word_features(fm_test_dna)
    test_words.add_preprocessor(sorter)
    test_words.apply_preprocessor()

    hamming = HammingWordDistance(train_words, train_words, use_sign)
    dm_train = hamming.get_distance_matrix()

    hamming.init(train_words, test_words)
    dm_test = hamming.get_distance_matrix()
    return hamming, dm_train, dm_test
示例#15
0
def kernel_match_word_string(fm_train_dna=traindat,
                             fm_test_dna=testdat,
                             degree=3,
                             scale=1.4,
                             size_cache=10,
                             order=3,
                             gap=0,
                             reverse=False):
    """Compute train and test matrices of a ``MatchWordStringKernel``.

    The kernel is created from ``size_cache`` and ``degree``, given an
    ``AvgDiagKernelNormalizer(scale)`` and only then initialised on the
    word features.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import MatchWordStringKernel, AvgDiagKernelNormalizer
    from shogun import StringWordFeatures, StringCharFeatures, DNA

    def _dna_words(strings):
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(DNA)
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _dna_words(fm_train_dna)
    test_words = _dna_words(fm_test_dna)

    match_kernel = MatchWordStringKernel(size_cache, degree)
    match_kernel.set_normalizer(AvgDiagKernelNormalizer(scale))

    match_kernel.init(train_words, train_words)
    km_train = match_kernel.get_kernel_matrix()

    match_kernel.init(train_words, test_words)
    km_test = match_kernel.get_kernel_matrix()
    return km_train, km_test, match_kernel
示例#16
0
def kernel_poly_match_word_string(fm_train_dna=traindat,
                                  fm_test_dna=testdat,
                                  degree=2,
                                  inhomogene=True,
                                  order=3,
                                  gap=0,
                                  reverse=False):
    """Compute train and test matrices of a ``PolyMatchWordStringKernel``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``; ``km_train`` is the
        train-vs-train matrix and ``km_test`` the matrix after
        re-initialising the kernel with (train, test) features.
    """
    from shogun import PolyMatchWordStringKernel
    from shogun import StringWordFeatures, StringCharFeatures, DNA

    train_chars = StringCharFeatures(fm_train_dna, DNA)
    train_words = StringWordFeatures(DNA)
    train_words.obtain_from_char(train_chars, order - 1, order, gap, reverse)

    test_chars = StringCharFeatures(fm_test_dna, DNA)
    test_words = StringWordFeatures(DNA)
    test_words.obtain_from_char(test_chars, order - 1, order, gap, reverse)

    poly_kernel = PolyMatchWordStringKernel(
        train_words, train_words, degree, inhomogene)
    km_train = poly_kernel.get_kernel_matrix()

    poly_kernel.init(train_words, test_words)
    km_test = poly_kernel.get_kernel_matrix()
    return km_train, km_test, poly_kernel
示例#17
0
def distribution_hmm(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
    """Fit an ``HMM`` to CUBE strings and return likelihoods plus the model.

    Word features are derived from ``fm_cube``; the HMM is trained with
    ``train()`` and then ``baum_welch_viterbi_train(BW_NORMAL)``.  All
    log-derivatives and best paths are computed once and discarded.

    Note: the ``num_examples`` argument is replaced by
    ``feats.get_num_vectors()`` before use, so the passed value is ignored.

    Returns:
        Tuple ``(lik_example, lik_sample, hmm)``.
    """
    from shogun import StringWordFeatures, StringCharFeatures, CUBE
    from shogun import HMM, BW_NORMAL

    chars = StringCharFeatures(CUBE)
    chars.set_features(fm_cube)
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, order - 1, order, gap, reverse)

    hmm_model = HMM(words, N, M, pseudo)
    hmm_model.train()
    hmm_model.baum_welch_viterbi_train(BW_NORMAL)

    num_examples = words.get_num_vectors()
    n_params = hmm_model.get_num_model_parameters()
    for vec in range(num_examples):
        for par in range(n_params):
            hmm_model.get_log_derivative(par, vec)

    # Running totals; computed only to exercise the calls.
    total_path = 0
    total_state = 0
    for vec in range(num_examples):
        total_path += hmm_model.best_path(vec)
        for pos in range(N):
            total_state += hmm_model.get_best_path_state(vec, pos)

    return (hmm_model.get_log_likelihood(),
            hmm_model.get_log_likelihood_sample(),
            hmm_model)
示例#18
0
def kernel_linear_string(fm_train_dna=traindat, fm_test_dna=testdat):
    """Compute train and test matrices of a ``LinearStringKernel`` on DNA strings.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, DNA
    from shogun import LinearStringKernel

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    linear = LinearStringKernel(train_feats, train_feats)
    train_matrix = linear.get_kernel_matrix()

    # Re-point the same kernel object at (train, test).
    linear.init(train_feats, test_feats)
    test_matrix = linear.get_kernel_matrix()

    return train_matrix, test_matrix, linear
示例#19
0
 def predict(self, data):
     """Predict labels for ``data`` and print them space-separated.

     The strings are wrapped in ``StringCharFeatures`` (alphabet ``SNP`` when
     ``self.isSNP`` is true, otherwise ``RAWBYTE``), the kernel between
     ``self.feats_train`` and the test features is evaluated, and its
     transpose is passed to ``self.clf.predict``.

     Returns:
         Whatever ``self.clf.predict`` returns.
     """
     from shogun import SNP, RAWBYTE
     from shogun import StringCharFeatures

     alphabet = SNP if self.isSNP else RAWBYTE
     feats_test = StringCharFeatures(data, alphabet)

     # Convert the test strings into an intermediate representation
     # (the train-vs-test kernel matrix).
     self.kernel.init(self.feats_train, feats_test)
     kernel_matrix = self.kernel.get_kernel_matrix()

     result = self.clf.predict(kernel_matrix.T)
     # Single-argument print() behaves the same on Python 2 and 3.
     print(' '.join(map(str, result)))
     return result
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """
    Build the sorted word-feature object used by the spectrum kernel.

    ``data`` is loaded as DNA char features, converted with
    ``obtain_from_char`` and passed through a ``SortWordString``
    preprocessor.  Returns the resulting ``StringWordFeatures``.
    """
    dna_chars = StringCharFeatures(data, DNA)

    spectrum = StringWordFeatures(dna_chars.get_alphabet())
    spectrum.obtain_from_char(dna_chars, order - 1, order, gap, reverse)

    sorter = SortWordString()
    sorter.init(spectrum)
    spectrum.add_preprocessor(sorter)
    spectrum.apply_preprocessor()

    return spectrum
示例#21
0
def classifier_svmlight(fm_train_dna=traindat,
                        fm_test_dna=testdat,
                        label_train_dna=label_traindat,
                        C=1.2,
                        epsilon=1e-5,
                        num_threads=1):
    """Train ``SVMLight`` with a degree-20 ``WeightedDegreeStringKernel``.

    If ``SVMLight`` cannot be imported a message is printed and ``None`` is
    returned.  Otherwise the SVM is trained, the kernel is re-initialised
    on (train, test) and predictions are computed (but not returned).

    Returns:
        The kernel object, or ``None`` when SVMLight is unavailable.
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    train_feats = StringCharFeatures(DNA)
    train_feats.set_features(fm_train_dna)
    test_feats = StringCharFeatures(DNA)
    test_feats.set_features(fm_test_dna)

    wd_degree = 20
    wd_kernel = WeightedDegreeStringKernel(train_feats, train_feats, wd_degree)
    train_labels = BinaryLabels(label_train_dna)

    machine = SVMLight(C, wd_kernel, train_labels)
    machine.set_epsilon(epsilon)
    machine.parallel.set_num_threads(num_threads)
    machine.train()

    wd_kernel.init(train_feats, test_feats)
    # Predicted labels are computed and discarded.
    machine.apply().get_labels()
    return wd_kernel
def kernel_fixed_degree_string(fm_train_dna=traindat, fm_test_dna=testdat, degree=3):
    """Compute train and test matrices of a ``FixedDegreeStringKernel``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, DNA
    from shogun import FixedDegreeStringKernel

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    fixed_kernel = FixedDegreeStringKernel(train_feats, train_feats, degree)
    train_matrix = fixed_kernel.get_kernel_matrix()

    fixed_kernel.init(train_feats, test_feats)
    test_matrix = fixed_kernel.get_kernel_matrix()

    return train_matrix, test_matrix, fixed_kernel
def kernel_histogram_word_string(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, order=3, ppseudo_count=1, npseudo_count=1):
    """Evaluate a ``HistogramWordStringKernel`` on train and test DNA strings.

    A ``PluginEstimate(ppseudo_count, npseudo_count)`` is trained on the
    training word features and labels and supplied to the kernel.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
    from shogun import PluginEstimate

    train_chars = StringCharFeatures(DNA)
    train_chars.set_features(fm_train_dna)
    train_words = StringWordFeatures(train_chars.get_alphabet())
    train_words.obtain_from_char(train_chars, order - 1, order, 0, False)

    test_chars = StringCharFeatures(DNA)
    test_chars.set_features(fm_test_dna)
    test_words = StringWordFeatures(test_chars.get_alphabet())
    test_words.obtain_from_char(test_chars, order - 1, order, 0, False)

    plugin = PluginEstimate(ppseudo_count, npseudo_count)
    train_labels = BinaryLabels(label_train_dna)
    plugin.set_labels(train_labels)
    plugin.set_features(train_words)
    plugin.train()

    histogram = HistogramWordStringKernel(train_words, train_words, plugin)
    km_train = histogram.get_kernel_matrix()

    histogram.init(train_words, test_words)
    plugin.set_features(test_words)
    # Output of the estimator on the test set is not used.
    plugin.apply().get_labels()
    km_test = histogram.get_kernel_matrix()
    return km_train, km_test, histogram
def classifier_svmlight_linear_term (fm_train_dna=traindna,fm_test_dna=testdna, \
                                                label_train_dna=label_traindna,degree=3, \
                                                C=10,epsilon=1e-5,num_threads=1):
    """Train ``SVMLight`` with a custom linear term and predict test labels.

    The SVM uses a ``WeightedDegreeStringKernel`` of the given ``degree``, a
    QP size of 3 and a fixed ten-element linear term.  If ``SVMLight``
    cannot be imported a message is printed and ``exit(0)`` is called.

    NOTE(review): the linear term has exactly ten entries, which presumably
    must match the number of training examples - confirm against callers.

    Returns:
        Tuple ``(out, kernel)`` with the predicted labels and the kernel.
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        print("SVMLight is not available")
        exit(0)

    train_feats = StringCharFeatures(DNA)
    train_feats.set_features(fm_train_dna)
    test_feats = StringCharFeatures(DNA)
    test_feats.set_features(fm_test_dna)

    wd_kernel = WeightedDegreeStringKernel(train_feats, train_feats, degree)
    train_labels = BinaryLabels(label_train_dna)

    machine = SVMLight(C, wd_kernel, train_labels)
    machine.set_qpsize(3)
    linear_term = -numpy.array([1,2,3,4,5,6,7,8,7,6], dtype=numpy.double)
    machine.set_linear_term(linear_term)
    machine.set_epsilon(epsilon)
    machine.parallel.set_num_threads(num_threads)
    machine.train()

    wd_kernel.init(train_feats, test_feats)
    predictions = machine.apply().get_labels()
    return predictions, wd_kernel
示例#25
0
def distribution_ppwm(fm_dna=traindna, order=3):
    """Compute the ``w`` matrix and first scoring of a ``PositionalPWM``.

    A ``PositionalPWM`` is configured with sigma 1, mean 4 and the log of a
    fixed 4x3 weight matrix; ``compute_w(20)`` and ``compute_scoring(1)``
    are then run.

    Byte features are still derived from ``fm_dna`` so that the same shogun
    calls are made as before, but they are not fed into the model.

    Cleanup versus the earlier version: the first 4x4 ``pwm`` matrix was
    overwritten before use and has been removed, along with the unused
    local ``k`` and the unused numpy imports ``e`` and ``exp``.

    Returns:
        Tuple ``(w, u)`` from ``get_w()`` and ``get_scoring(0)``.
    """
    from shogun import StringByteFeatures, StringCharFeatures, DNA
    from shogun import PositionalPWM

    from numpy import array, log

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    # Built only for parity with the original call sequence; not used below.
    feats = StringByteFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, 0, False)

    L = 20
    sigma = 1
    mu = 4

    ppwm = PositionalPWM()
    ppwm.set_sigma(sigma)
    ppwm.set_mean(mu)

    pwm = array([[0.01, 0.09, 0.1],
                 [0.09, 0.01, 0.1],
                 [0.85, 0.4, 0.1],
                 [0.05, 0.5, 0.7]])
    # The model is given the element-wise log of the matrix.
    ppwm.set_pwm(log(pwm))

    ppwm.compute_w(L)
    w = ppwm.get_w()

    ppwm.compute_scoring(1)
    u = ppwm.get_scoring(0)

    return w, u
def kernel_comm_ulong_string(fm_train_dna=traindat, fm_test_dna=testdat, order=3, gap=0, reverse=False):
    """Compute train and test matrices of a ``CommUlongStringKernel``.

    Both string sets are converted to ``StringUlongFeatures`` and passed
    through one ``SortUlongString`` preprocessor initialised on the training
    features.  The kernel is always built with ``use_sign=False``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import CommUlongStringKernel
    from shogun import StringUlongFeatures, StringCharFeatures, DNA
    from shogun import SortUlongString

    def _ulong_features(strings):
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        ulongs = StringUlongFeatures(chars.get_alphabet())
        ulongs.obtain_from_char(chars, order - 1, order, gap, reverse)
        return ulongs

    train_ulongs = _ulong_features(fm_train_dna)
    sorter = SortUlongString()
    sorter.init(train_ulongs)
    train_ulongs.add_preprocessor(sorter)
    train_ulongs.apply_preprocessor()

    test_ulongs = _ulong_features(fm_test_dna)
    test_ulongs.add_preprocessor(sorter)
    test_ulongs.apply_preprocessor()

    comm_kernel = CommUlongStringKernel(train_ulongs, train_ulongs, False)
    km_train = comm_kernel.get_kernel_matrix()

    comm_kernel.init(train_ulongs, test_ulongs)
    km_test = comm_kernel.get_kernel_matrix()
    return km_train, km_test, comm_kernel
def distance_canberraword(fm_train_dna=traindna, fm_test_dna=testdna, order=3, gap=0, reverse=False):
    """Compute train and test matrices of a ``CanberraWordDistance``.

    Word features for both sets share a ``SortWordString`` preprocessor
    initialised on the training features.

    Returns:
        Tuple ``(distance, dm_train, dm_test)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString
    from shogun import CanberraWordDistance

    train_chars = StringCharFeatures(DNA)
    train_chars.set_features(fm_train_dna)
    train_words = StringWordFeatures(train_chars.get_alphabet())
    train_words.obtain_from_char(train_chars, order - 1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preprocessor(sorter)
    train_words.apply_preprocessor()

    test_chars = StringCharFeatures(DNA)
    test_chars.set_features(fm_test_dna)
    test_words = StringWordFeatures(test_chars.get_alphabet())
    test_words.obtain_from_char(test_chars, order - 1, order, gap, reverse)
    test_words.add_preprocessor(sorter)
    test_words.apply_preprocessor()

    canberra = CanberraWordDistance(train_words, train_words)
    dm_train = canberra.get_distance_matrix()

    canberra.init(train_words, test_words)
    dm_test = canberra.get_distance_matrix()
    return canberra, dm_train, dm_test
def create_distance_matrix(full_essays, ids):
    """Return a DataFrame of subsequence-string-kernel values between essays.

    Despite the name, the values come from
    ``SubsequenceStringKernel(..., 3, 0.5).get_kernel_matrix()``, i.e. a
    kernel matrix of the essays against themselves.  Columns are named
    ``'id_<id>'`` for each entry of ``ids``; the row index is left at the
    DataFrame default.
    """
    essay_feats = StringCharFeatures(full_essays, RAWBYTE)
    ssk = SubsequenceStringKernel(essay_feats, essay_feats, 3, 0.5)

    frame = pd.DataFrame(ssk.get_kernel_matrix())
    frame.columns = ['id_' + str(essay_id) for essay_id in ids]
    return frame
示例#29
0
def features_hasheddocdot(strings):
    """Wrap ``strings`` in ``HashedDocDotFeatures`` and return them.

    The features are built with 5 hash bits (a target dimension of
    2^5 = 32), an ``NGramTokenizer(8)`` and normalisation enabled.
    """
    from shogun import StringCharFeatures, RAWBYTE
    from shogun import HashedDocDotFeatures
    from shogun import NGramTokenizer
    from numpy import array

    raw_strings = StringCharFeatures(strings, RAWBYTE)

    hash_bits = 5              # target dimension is 2^5 = 32
    ngram_tokenizer = NGramTokenizer(8)
    normalise = True

    return HashedDocDotFeatures(hash_bits, raw_strings, ngram_tokenizer, normalise)
示例#30
0
def distribution_ppwm(fm_dna=traindna, order=3):
    """Return the ``w`` matrix and first scoring of a ``PositionalPWM``.

    The model gets sigma 1, mean 4 and the element-wise log of a fixed 4x3
    matrix, after which ``compute_w(20)`` and ``compute_scoring(1)`` are
    run.

    ``fm_dna`` is still converted to byte features so the shogun call
    sequence is unchanged, but those features are not passed to the model.

    Cleanup versus the earlier version: a 4x4 ``pwm`` matrix that was
    overwritten before use, the unused local ``k`` and the unused numpy
    imports ``e`` and ``exp`` have been removed.

    Returns:
        Tuple ``(w, u)`` from ``get_w()`` and ``get_scoring(0)``.
    """
    from shogun import StringByteFeatures, StringCharFeatures, DNA
    from shogun import PositionalPWM

    from numpy import array, log

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    # Not used below; kept so the same shogun calls are made as before.
    feats = StringByteFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, 0, False)

    window_length = 20
    sigma = 1
    mu = 4

    ppwm = PositionalPWM()
    ppwm.set_sigma(sigma)
    ppwm.set_mean(mu)

    pwm = array([[0.01, 0.09, 0.1], [0.09, 0.01, 0.1], [0.85, 0.4, 0.1],
                 [0.05, 0.5, 0.7]])
    ppwm.set_pwm(log(pwm))

    ppwm.compute_w(window_length)
    w = ppwm.get_w()

    ppwm.compute_scoring(1)
    u = ppwm.get_scoring(0)

    return w, u
示例#31
0
 def score(self, data, label):
     """Predict labels for ``data`` and report accuracy and macro F1.

     The strings are wrapped in ``StringCharFeatures`` (alphabet ``SNP`` when
     ``self.isSNP`` is true, otherwise ``RAWBYTE``), the kernel between
     ``self.feats_train`` and the test features is evaluated, and its
     transpose is passed to ``self.clf.predict``.  Both metrics are printed
     on one line and returned.

     Returns:
         Tuple ``(acc, f1)`` from ``accuracy_score`` and
         ``f1_score(..., average='macro')``.
     """
     from shogun import SNP, RAWBYTE
     from shogun import StringCharFeatures
     from sklearn.metrics import accuracy_score
     from sklearn.metrics import f1_score

     alphabet = SNP if self.isSNP else RAWBYTE
     feats_test = StringCharFeatures(data, alphabet)

     # Convert the test strings into an intermediate representation
     # (the train-vs-test kernel matrix).
     self.kernel.init(self.feats_train, feats_test)
     kernel_matrix = self.kernel.get_kernel_matrix()

     result = self.clf.predict(kernel_matrix.T)
     acc = accuracy_score(label, result)
     f1 = f1_score(label, result, average='macro')

     # One pre-joined string keeps the output identical on Python 2 and 3
     # (the old print statement separated its two items with a space).
     print('正确率是:' + str(acc) + ' ' + 'F1得分是:' + str(f1))
     return acc, f1
示例#32
0
def kernel_poly_match_string(fm_train_dna=traindat,
                             fm_test_dna=testdat,
                             degree=3,
                             inhomogene=False):
    """Compute train and test matrices of a ``PolyMatchStringKernel``.

    The test features are built from ``fm_test_dna``.  Previously they were
    built from ``fm_train_dna`` by mistake, so ``fm_test_dna`` was ignored
    and ``km_test`` was just a second copy of the training matrix.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import PolyMatchStringKernel
    from shogun import StringCharFeatures, DNA

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)

    kernel = PolyMatchStringKernel(feats_train, feats_train, degree,
                                   inhomogene)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
示例#33
0
def kernel_distantsegments(fm_train_dna=traindat,
                           fm_test_dna=testdat,
                           delta=5,
                           theta=5):
    """Compute train and test matrices of a ``DistantSegmentsKernel``.

    The kernel is constructed with the literal value 10 as its third
    argument, followed by ``delta`` and ``theta``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, DNA
    from shogun import DistantSegmentsKernel

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    ds_kernel = DistantSegmentsKernel(train_feats, train_feats, 10, delta, theta)
    train_matrix = ds_kernel.get_kernel_matrix()

    ds_kernel.init(train_feats, test_feats)
    test_matrix = ds_kernel.get_kernel_matrix()

    return train_matrix, test_matrix, ds_kernel
示例#34
0
def features_string_file(directory, fname):
    """Load RAWBYTE string features from a directory, then from a CSV file.

    The same feature object is first filled via
    ``load_from_directory(directory)`` and afterwards via
    ``load(CSVFile(fname))``.

    Returns:
        Tuple ``(f.get_features(), f)``.
    """
    from shogun import StringCharFeatures, RAWBYTE
    from shogun import CSVFile

    string_feats = StringCharFeatures(RAWBYTE)

    # First source: every file found in the directory.
    string_feats.load_from_directory(directory)

    # Second source: one string per line of the given file.
    csv_source = CSVFile(fname)
    string_feats.load(csv_source)

    return string_feats.get_features(), string_feats
示例#35
0
def kernel_simple_locality_improved_string(fm_train_dna=traindat, fm_test_dna=testdat,
                                           length=5, inner_degree=5, outer_degree=1):
    """Compute SimpleLocalityImprovedStringKernel matrices on DNA strings.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        length, inner_degree, outer_degree: forwarded unchanged to the
            kernel constructor, in that order.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, DNA
    from shogun import SimpleLocalityImprovedStringKernel, MSG_DEBUG

    train_features = StringCharFeatures(fm_train_dna, DNA)
    test_features = StringCharFeatures(fm_test_dna, DNA)
    # For verbose output: train_features.io.set_loglevel(MSG_DEBUG)

    sli_kernel = SimpleLocalityImprovedStringKernel(
        train_features, train_features, length, inner_degree, outer_degree)
    train_matrix = sli_kernel.get_kernel_matrix()

    # Same kernel object, now evaluated train-vs-test.
    sli_kernel.init(train_features, test_features)
    test_matrix = sli_kernel.get_kernel_matrix()

    return train_matrix, test_matrix, sli_kernel
示例#36
0
def kernel_ssk_string(fm_train_dna=traindat,
                      fm_test_dna=testdat,
                      maxlen=1,
                      decay=1):
    """Compute SubsequenceStringKernel matrices on DNA strings.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        maxlen: forwarded as the 3rd kernel constructor argument.
        decay: forwarded as the 4th kernel constructor argument.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, DNA
    from shogun import SubsequenceStringKernel

    train_features = StringCharFeatures(fm_train_dna, DNA)
    test_features = StringCharFeatures(fm_test_dna, DNA)

    ssk = SubsequenceStringKernel(train_features, train_features,
                                  maxlen, decay)
    train_matrix = ssk.get_kernel_matrix()

    # Switch the right-hand side to the test features and evaluate again.
    ssk.init(train_features, test_features)
    test_matrix = ssk.get_kernel_matrix()

    return train_matrix, test_matrix, ssk
def kernel_locality_improved_string(fm_train_dna=traindat,
                                    fm_test_dna=testdat,
                                    length=5,
                                    inner_degree=5,
                                    outer_degree=7):
    """Compute LocalityImprovedStringKernel matrices on DNA strings.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        length, inner_degree, outer_degree: forwarded unchanged to the
            kernel constructor, in that order.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import LocalityImprovedStringKernel
    from shogun import StringCharFeatures, DNA

    train_features = StringCharFeatures(fm_train_dna, DNA)
    test_features = StringCharFeatures(fm_test_dna, DNA)

    li_kernel = LocalityImprovedStringKernel(
        train_features, train_features, length, inner_degree, outer_degree)
    train_matrix = li_kernel.get_kernel_matrix()

    # Re-initialise for the train-vs-test matrix.
    li_kernel.init(train_features, test_features)
    test_matrix = li_kernel.get_kernel_matrix()

    return train_matrix, test_matrix, li_kernel
示例#38
0
	def get_predictions(self, sequence, positions):
		"""Classify one window of `sequence` per entry of `positions`.

		Each position is shifted by ``self.offset`` to give an index ``i``;
		the window is ``sequence[i - window_left : i + window_right + 2]``.
		The windows are wrapped in DNA string features, ``self.wd_kernel`` is
		re-initialised on (``self.traindat``, windows) and the labels from
		``self.svm.classify()`` are returned.

		Windows are taken by plain slicing; positions closer than
		``window_left`` to the start of `sequence` are not rejected here.
		"""
		# Iterate positions directly: works on Python 2 and 3 (no xrange).
		testdat = []
		for pos in positions:
			i = pos - self.offset
			testdat.append(sequence[i-self.window_left:i+self.window_right+2])

		t = StringCharFeatures(DNA)
		t.set_string_features(testdat)

		self.wd_kernel.init(self.traindat, t)
		labels = self.svm.classify().get_labels()
		sys.stderr.write("\n...done...\n")
		return labels
示例#39
0
    def get_predictions(self, sequence, positions):
        """Classify one window of `sequence` per entry of `positions`.

        Each position is shifted by ``self.offset`` to give an index ``i``;
        the window is ``sequence[i - window_left : i + window_right + 2]``.
        The windows are wrapped in DNA string features, ``self.wd_kernel`` is
        re-initialised on (``self.traindat``, windows) and the labels from
        ``self.svm.classify()`` are returned.

        Windows are taken by plain slicing; positions closer than
        ``window_left`` to the start of `sequence` are not rejected here.
        """
        # Iterate positions directly: works on Python 2 and 3 (no xrange).
        testdat = []
        for pos in positions:
            i = pos - self.offset
            testdat.append(sequence[i - self.window_left:i + self.window_right + 2])

        t = StringCharFeatures(DNA)
        t.set_string_features(testdat)

        self.wd_kernel.init(self.traindat, t)
        labels = self.svm.classify().get_labels()
        sys.stderr.write("\n...done...\n")
        return labels
示例#40
0
def features_string_char(strings):
    """Create raw-byte string features and overwrite the first string.

    Args:
        strings: collection of strings handed to ``StringCharFeatures``.

    Returns:
        Tuple ``(f.get_features(), f)`` taken after vector 0 has been
        replaced by the characters of ``'test'``.
    """
    from numpy import array
    from shogun import StringCharFeatures, RAWBYTE

    char_feats = StringCharFeatures(strings, RAWBYTE)

    # Useful inspection calls while experimenting:
    #   char_feats.get_max_vector_length()
    #   char_feats.get_num_vectors()
    #   char_feats.get_vector_length(0)
    #   ''.join(char_feats.get_feature_vector(5))

    # Replace string 0 with the character array t, e, s, t.
    replacement = array(list('test'))
    char_feats.set_feature_vector(replacement, 0)

    return char_feats.get_features(), char_feats
示例#41
0
def kernel_fisher(fm_train_dna=traindat, fm_test_dna=testdat,
                  label_train_dna=label_traindat,
                  N=1, M=4, pseudo=1e-1, order=1, gap=0, reverse=False,
                  kargs=[1, False, True]):
    """Compute a PolyKernel on Fisher-kernel (FK) features from two HMMs.

    Two HMMs are trained, one on ``fm_hmm_pos`` and one on ``fm_hmm_neg``.
    Both names are read from the enclosing scope, not from the arguments.
    ``label_train_dna`` is accepted but not used inside this function.

    Args:
        fm_train_dna: strings used for the training kernel matrix.
        fm_test_dna: strings used for the test kernel matrix.
        label_train_dna: unused here.
        N, M, pseudo: forwarded to the ``HMM`` constructor.
        order, gap, reverse: forwarded to ``obtain_from_char``.
        kargs: extra positional arguments for ``PolyKernel``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from shogun import PolyKernel
    from shogun import HMM, BW_NORMAL

    def as_word_features(dna_strings):
        # DNA strings -> char features -> word features of the given order.
        chars = StringCharFeatures(dna_strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    # HMM for the positive class.
    pos_words = as_word_features(fm_hmm_pos)
    pos_hmm = HMM(pos_words, N, M, pseudo)
    pos_hmm.baum_welch_viterbi_train(BW_NORMAL)

    # HMM for the negative class.
    neg_words = as_word_features(fm_hmm_neg)
    neg_hmm = HMM(neg_words, N, M, pseudo)
    neg_hmm.baum_welch_viterbi_train(BW_NORMAL)

    # Word features the kernel is evaluated on.
    train_words = as_word_features(fm_train_dna)
    test_words = as_word_features(fm_test_dna)

    # Training side: point both HMMs at the training observations.
    pos_hmm.set_observations(train_words)
    neg_hmm.set_observations(train_words)
    fk_train = FKFeatures(10, pos_hmm, neg_hmm)
    fk_train.set_opt_a(-1)  # estimate prior
    poly = PolyKernel(fk_train, fk_train, *kargs)
    train_matrix = poly.get_kernel_matrix()

    # Test side: copies of the HMMs carry the test observations so the
    # originals keep the training observations.
    pos_copy = HMM(pos_hmm)
    neg_copy = HMM(neg_hmm)
    pos_copy.set_observations(test_words)
    neg_copy.set_observations(test_words)
    fk_test = FKFeatures(10, pos_copy, neg_copy)
    fk_test.set_a(fk_train.get_a())  # use prior from training data
    poly.init(fk_train, fk_test)
    test_matrix = poly.get_kernel_matrix()

    return train_matrix, test_matrix, poly
示例#42
0
	def get_predictions(self, sequence, positions):
		"""Score one window of `sequence` per entry of `positions`.

		Window starts are ``position - self.offset - self.window_left`` and
		every window is ``window_left + window_right + 2`` characters long.
		The windows are obtained from a single string feature object via a
		position list, so `sequence` is passed in as one string.
		``self.wd_kernel`` is initialised on (``self.traindat``, windows), the
		values of ``self.svm.apply()`` are collected, and the kernel is
		cleaned up before returning.
		"""
		seqlen = self.window_right + self.window_left + 2

		# Iterate positions directly: works on Python 2 and 3 (no xrange).
		position_list = DynamicIntArray()
		for pos in positions:
			position_list.append_element(pos - self.offset - self.window_left)

		t = StringCharFeatures([sequence], DNA)
		t.obtain_by_position_list(seqlen, position_list)
		self.wd_kernel.init(self.traindat, t)
		# Drop the local reference; the kernel was already initialised with it.
		del t

		self.wd_kernel.io.enable_progress()
		values = self.svm.apply().get_values()
		self.wd_kernel.cleanup()
		sys.stdout.write("\n...done...\n")
		return values
def features_string_file(directory, fname):
    """Load raw-byte string features from a directory, then from a file.

    Args:
        directory: path handed to ``load_from_directory``.
        fname: path opened through ``CSVFile`` (one string per line) and
            handed to ``load`` on the same feature object.

    Returns:
        Tuple ``(f.get_features(), f)`` taken after the second load.
    """
    from shogun import CSVFile
    from shogun import StringCharFeatures, RAWBYTE

    string_feats = StringCharFeatures(RAWBYTE)

    # Variant 1: populate the features from the files in a directory.
    string_feats.load_from_directory(directory)

    # Useful inspection calls while experimenting:
    #   string_feats.get_max_vector_length()
    #   string_feats.get_num_vectors()
    #   string_feats.get_vector_length(0)
    #   string_feats.get_feature(0, 0)
    #   string_feats.get_feature_vector(0)

    # Variant 2: load from a single file on the same object.
    csv_source = CSVFile(fname)
    string_feats.load(csv_source)

    # Variant 3 (not exercised): string_feats.load_fasta('fasta.fa')
    return string_feats.get_features(), string_feats
示例#44
0
def kernel_top(fm_train_dna=traindat, fm_test_dna=testdat,
               label_train_dna=label_traindat, pseudo=1e-1,
               order=1, gap=0, reverse=False, kargs=[1, False, True]):
    """Compute a PolyKernel on TOP features derived from two HMMs.

    Two single-state HMMs over 4 symbols are trained, one on ``fm_hmm_pos``
    and one on ``fm_hmm_neg``. Both names are read from the enclosing
    scope, not from the arguments. ``label_train_dna`` is accepted but not
    used inside this function.

    Args:
        fm_train_dna: strings used for the training kernel matrix.
        fm_test_dna: strings used for the test kernel matrix.
        label_train_dna: unused here.
        pseudo: forwarded to the ``HMM`` constructor.
        order, gap, reverse: forwarded to ``obtain_from_char``.
        kargs: extra positional arguments for ``PolyKernel``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
    from shogun import PolyKernel
    from shogun import HMM, BW_NORMAL

    num_states = 1    # toy HMM with 1 state
    num_symbols = 4   # 4 observations -> DNA

    def as_word_features(dna_strings):
        # DNA strings -> char features -> word features of the given order.
        chars = StringCharFeatures(dna_strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    # HMM for the positive class.
    pos_words = as_word_features(fm_hmm_pos)
    pos_hmm = HMM(pos_words, num_states, num_symbols, pseudo)
    pos_hmm.baum_welch_viterbi_train(BW_NORMAL)

    # HMM for the negative class.
    neg_words = as_word_features(fm_hmm_neg)
    neg_hmm = HMM(neg_words, num_states, num_symbols, pseudo)
    neg_hmm.baum_welch_viterbi_train(BW_NORMAL)

    # Word features the kernel is evaluated on.
    train_words = as_word_features(fm_train_dna)
    test_words = as_word_features(fm_test_dna)

    # Training side: point both HMMs at the training observations.
    pos_hmm.set_observations(train_words)
    neg_hmm.set_observations(train_words)
    top_train = TOPFeatures(10, pos_hmm, neg_hmm, False, False)
    poly = PolyKernel(top_train, top_train, *kargs)
    train_matrix = poly.get_kernel_matrix()

    # Test side: copies of the HMMs carry the test observations so the
    # originals keep the training observations.
    pos_copy = HMM(pos_hmm)
    neg_copy = HMM(neg_hmm)
    pos_copy.set_observations(test_words)
    neg_copy.set_observations(test_words)
    top_test = TOPFeatures(10, pos_copy, neg_copy, False, False)
    poly.init(top_train, top_test)
    test_matrix = poly.get_kernel_matrix()

    return train_matrix, test_matrix, poly
示例#45
0
def classifier_svmlight(fm_train_dna=traindat, fm_test_dna=testdat,
                        label_train_dna=label_traindat, C=1.2,
                        epsilon=1e-5, num_threads=1):
    """Train SVMLight with a weighted-degree string kernel and apply it.

    Args:
        fm_train_dna: training DNA strings.
        fm_test_dna: test DNA strings.
        label_train_dna: binary labels for the training strings.
        C: forwarded as the first ``SVMLight`` constructor argument.
        epsilon: value passed to ``svm.set_epsilon``.
        num_threads: value passed to ``svm.parallel.set_num_threads``.

    Returns:
        The kernel, initialised on (train, test), or ``None`` when SVMLight
        cannot be imported (a message is printed in that case). The
        predicted labels are computed but not returned.
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return None

    train_features = StringCharFeatures(DNA)
    train_features.set_features(fm_train_dna)
    test_features = StringCharFeatures(DNA)
    test_features.set_features(fm_test_dna)

    wd_degree = 20  # kernel degree is fixed in this example
    wd_kernel = WeightedDegreeStringKernel(train_features, train_features,
                                           wd_degree)
    train_labels = BinaryLabels(label_train_dna)

    machine = SVMLight(C, wd_kernel, train_labels)
    machine.set_epsilon(epsilon)
    machine.parallel.set_num_threads(num_threads)
    machine.train()

    # Predict on the test strings; the labels are fetched and discarded.
    wd_kernel.init(train_features, test_features)
    machine.apply().get_labels()
    return wd_kernel
def distribution_histogram(fm_dna=traindna, order=3, gap=0, reverse=False):
    """Train a Histogram distribution on word features built from DNA.

    Args:
        fm_dna: DNA strings.
        order, gap, reverse: forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(histo, log_likelihood_sample, log_likelihood)``. Note the
        order: the sample value comes before the per-example likelihood.
    """
    from shogun import Histogram
    from shogun import StringWordFeatures, StringCharFeatures, DNA

    char_feats = StringCharFeatures(DNA)
    char_feats.set_features(fm_dna)
    word_feats = StringWordFeatures(char_feats.get_alphabet())
    word_feats.obtain_from_char(char_feats, order - 1, order, gap, reverse)

    histogram = Histogram(word_feats)
    histogram.train()

    # These accessors are called for their own sake; results are not used.
    histogram.get_histogram()
    word_feats.get_num_vectors()
    histogram.get_num_model_parameters()
    # A per-example / per-parameter get_log_derivative(j, i) sweep is
    # possible here but intentionally not run.

    likelihood = histogram.get_log_likelihood()
    sample_likelihood = histogram.get_log_likelihood_sample()
    return histogram, sample_likelihood, likelihood
def distance_manhattenword(train_fname=traindna, test_fname=testdna,
                           order=3, gap=0, reverse=False):
    """Compute Manhattan word distance matrices on sorted word features.

    Args:
        train_fname: file with training DNA strings, opened via ``CSVFile``.
        test_fname: file with test DNA strings, opened via ``CSVFile``.
        order, gap, reverse: forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(dm_train, dm_test)``: train-vs-train and train-vs-test
        distance matrices.
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString, ManhattanWordDistance, CSVFile

    # Training side: the sorting preprocessor is initialised on these.
    train_chars = StringCharFeatures(CSVFile(train_fname), DNA)
    train_words = StringWordFeatures(train_chars.get_alphabet())
    train_words.obtain_from_char(train_chars, order - 1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preprocessor(sorter)
    train_words.apply_preprocessor()

    # Test side: reuses the same preprocessor object.
    test_chars = StringCharFeatures(CSVFile(test_fname), DNA)
    test_words = StringWordFeatures(test_chars.get_alphabet())
    test_words.obtain_from_char(test_chars, order - 1, order, gap, reverse)
    test_words.add_preprocessor(sorter)
    test_words.apply_preprocessor()

    manhattan = ManhattanWordDistance(train_words, train_words)
    train_matrix = manhattan.get_distance_matrix()

    manhattan.init(train_words, test_words)
    test_matrix = manhattan.get_distance_matrix()
    return train_matrix, test_matrix
示例#48
0
    def get_test_features(self, seq, window):
        """Build sliding-window test features from a single sequence.

        `seq` is trimmed using offsets derived from `window` and
        ``self.window``. The symbols N, R and M are replaced by A, and the
        result is wrapped as one DNA string. Windows of
        ``self.window[2] - self.window[0] + 1`` characters are then taken
        with step 1.

        Returns:
            Word features when ``self.preproc`` is truthy (order taken from
            ``self.train_features``), otherwise char features.
        """
        first = self.window[0] - window[0]
        last = len(seq) - window[1] + self.window[2]
        width = self.window[2] - self.window[0] + 1

        trimmed = seq[first:last]
        for symbol in ("N", "R", "M"):
            trimmed = trimmed.replace(symbol, "A")

        char_feats = StringCharFeatures([trimmed], DNA)

        if not self.preproc:
            char_feats.obtain_by_sliding_window(width, 1)
            return char_feats

        word_feats = StringWordFeatures(char_feats.get_alphabet())
        word_order = self.train_features.get_order()
        word_feats.obtain_from_char(char_feats, 0, word_order, 0, False)
        # The third argument (order - 1) is only passed for word features.
        word_feats.obtain_by_sliding_window(width, 1, word_order - 1)
        return word_feats
def classifier_svmlight_batch_linadd(fm_train_dna, fm_test_dna,
                                     label_train_dna, degree, C, epsilon,
                                     num_threads):
    """Train SVMLight on a weighted-degree kernel and predict twice.

    Predictions are computed once with batch computation and linadd
    disabled, then again with batch computation enabled. Only the second
    set of labels is returned.

    Args:
        fm_train_dna: training DNA strings.
        fm_test_dna: test DNA strings.
        label_train_dna: binary labels for the training strings.
        degree: degree of the WeightedDegreeStringKernel. The caller's
            value is used as given; it was previously overwritten by a
            hard-coded 20.
        C: forwarded as the first ``SVMLight`` constructor argument.
        epsilon: value passed to ``svm.set_epsilon``.
        num_threads: value passed to ``svm.parallel.set_num_threads``.

    Returns:
        Tuple ``(labels, svm)``, or ``None`` when SVMLight cannot be
        imported (a message is printed in that case).
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    feats_train = StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = BinaryLabels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)

    # First pass: plain evaluation; the result is intentionally discarded.
    svm.set_batch_computation_enabled(False)
    svm.set_linadd_enabled(False)
    svm.apply().get_labels()

    # Second pass: batch computation enabled; these labels are returned.
    svm.set_batch_computation_enabled(True)
    labels = svm.apply().get_labels()
    return labels, svm
def features_string_char_compressed(fname):
    """Round-trip string features through several compression formats.

    The strings in `fname` are saved under ``tmp/`` in each format and read
    back with decompression on load. The LZO file is then read back twice
    more without decompressing on load: once decompressed by applying a
    ``DecompressCharString`` preprocessor, and once with on-the-fly
    preprocessing enabled.

    The temporary files are removed in a ``finally`` block, so they are
    cleaned up even when one of the save/load steps raises.

    Args:
        fname: path of the file holding the original strings.

    Returns:
        None.
    """
    import os
    from shogun import StringCharFeatures, StringFileCharFeatures, RAWBYTE
    from shogun import UNCOMPRESSED, LZO, GZIP, BZIP2, LZMA
    from shogun import DecompressCharString

    # (target file, compression type, level) in the order they are exercised.
    # Snappy is left out on purpose ("not stable yet" in the original).
    round_trips = [
        ("tmp/foo_uncompressed.str", UNCOMPRESSED, 1),
        ("tmp/foo_lzo.str", LZO, 9),
        ("tmp/foo_gzip.str", GZIP, 9),
        ("tmp/foo_bzip2.str", BZIP2, 9),
        ("tmp/foo_lzma.str", LZMA, 9),
    ]
    lzo_path = "tmp/foo_lzo.str"
    # The snappy file is never written here but is still cleaned up if a
    # stale copy exists, matching the original cleanup list.
    tmp_files = [path for path, _, _ in round_trips] + ["tmp/foo_snappy.str"]

    source = StringFileCharFeatures(fname, RAWBYTE)

    try:
        # Save compressed, then load and decompress on load.
        for path, codec, level in round_trips:
            source.save_compressed(path, codec, level)
            restored = StringCharFeatures(RAWBYTE)
            restored.load_compressed(path, True)

        # Load compressed data and decompress via preprocessor.
        restored = StringCharFeatures(RAWBYTE)
        restored.load_compressed(lzo_path, False)
        restored.add_preprocessor(DecompressCharString(LZO))
        restored.apply_preprocessor()

        # Load compressed data and decompress on-the-fly via preprocessor.
        restored = StringCharFeatures(RAWBYTE)
        restored.load_compressed(lzo_path, False)
        restored.add_preprocessor(DecompressCharString(LZO))
        restored.enable_on_the_fly_preprocessing()
    finally:
        for path in tmp_files:
            if os.path.exists(path):
                os.unlink(path)
def features_string_sliding_window(strings):
    """Demonstrate sliding-window and position-list views on one string.

    Args:
        strings: a single DNA string; it is wrapped in a one-element list
            before being handed to ``StringCharFeatures``.
            NOTE(review): the hard-coded positions (up to 25) with a window
            of 8 presumably need a string of at least 33 characters.
            Confirm against callers.

    Returns:
        The feature object, left in the state produced by the last
        ``obtain_by_position_list(8, ...)`` call.
    """
    from shogun import StringCharFeatures, DNA
    from shogun import DynamicIntArray

    f = StringCharFeatures([strings], DNA)

    # Slide a window of length 5, then of length 4, with step 1
    # (memory efficient, does not copy strings).
    f.obtain_by_sliding_window(5, 1)
    f.obtain_by_sliding_window(4, 1)

    # Reset to the full input string before extracting by position.
    # This previously referenced an undefined name `s` instead of the
    # `strings` argument.
    f.set_features([strings])

    # Window start positions for the position-list extraction.
    positions = DynamicIntArray()
    for start in (0, 6, 16, 25):
        positions.append_element(start)

    # Extract windows of size 4, then of size 8, from the same positions.
    f.obtain_by_position_list(4, positions)
    f.obtain_by_position_list(8, positions)
    return f
def tests_check_commwordkernel_memleak(num, order, gap, reverse):
    """Repeatedly build a CommWordStringKernel to expose memory leaks.

    The same feature/preprocessor/kernel pipeline is constructed ten times
    on identical data. The kernel matrix from the last round is returned.

    Args:
        num: repetition count for the 4-letter motifs that make up every
            input string (``num * 'ACGT'`` / ``num * 'TTGT'``).
        order, gap, reverse: forwarded to ``obtain_from_char``.

    Returns:
        Kernel matrix of the final iteration.
    """
    import gc
    from shogun import Alphabet,StringCharFeatures,StringWordFeatures,DNA
    from shogun import SortWordString, MSG_DEBUG
    from shogun import CommWordStringKernel, IdentityKernelNormalizer
    from numpy import mat

    def build_strings():
        # 141 strings: 60 x ACGT motif, 21 x TTGT motif, 60 x ACGT motif.
        acgt = num * 'ACGT'
        ttgt = num * 'TTGT'
        return [acgt] * 60 + [ttgt] * 21 + [acgt] * 60

    # Both "classes" deliberately hold the same strings.
    POS = build_strings()
    NEG = build_strings()

    for _ in range(10):
        alphabet = Alphabet(DNA)
        char_feats = StringCharFeatures(alphabet)
        char_feats.set_features(POS + NEG)

        word_feats = StringWordFeatures(char_feats.get_alphabet())
        word_feats.obtain_from_char(char_feats, order - 1, order, gap, reverse)

        sorter = SortWordString()
        sorter.init(word_feats)
        word_feats.add_preprocessor(sorter)
        word_feats.apply_preprocessor()

        spectrum = CommWordStringKernel(10, False)
        spectrum.set_normalizer(IdentityKernelNormalizer())
        spectrum.init(word_feats, word_feats)
        K = spectrum.get_kernel_matrix()

    # Drop the local references before returning, as the original does.
    del POS, NEG, order, gap, reverse
    return K
示例#53
0
def _sequence_char_features(examples, seq_source, nuc_con):
    """Wrap DNA or protein sequences as StringCharFeatures.

    The examples are first passed through ``non_atcg_convert`` (DNA) or
    ``non_aminoacid_converter`` (protein) together with `nuc_con`. Any other
    `seq_source` is reported on stderr and ends the process with status -1.
    """
    if seq_source == 'dna':
        return StringCharFeatures(non_atcg_convert(examples, nuc_con), DNA)
    if seq_source == 'protein':
        return StringCharFeatures(non_aminoacid_converter(examples, nuc_con), PROTEIN)
    sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
    sys.exit(-1)


def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features

    Args:
        kname: kernel name; selects the feature type:
            'gauss' / 'linear' / 'poly'        -> RealFeatures
            'wd' / 'localalign' / 'localimprove' -> StringCharFeatures
            'spec' / 'cumspec'                 -> sorted StringUlongFeatures
            'spec2' / 'cumspec2'               -> dict with 'combined', 'f0'
                                                  and 'f1' word features
        examples: input data; for the '*spec2' kernels a sequence of pairs.
        kparam: dict; ``kparam['degree']`` is read for the spectrum kernels.
        train_mode: when true, a new sorting preprocessor is created and
            initialised on the features; otherwise `preproc` is reused.
        preproc: preprocessor to apply when not in training mode.
        seq_source: 'dna' or 'protein' (used by the 'wd' and 'spec' families).
        nuc_con: forwarded to the symbol conversion helpers.

    Returns:
        Tuple ``(feats, preproc)``. In training mode `preproc` is the
        preprocessor created here (for the '*spec2' kernels, the one created
        for the second sequence).

    Raises:
        ValueError: if `kname` is not one of the kernels listed above.
            This path previously printed a message and then failed with
            UnboundLocalError on the return statement.
    """
    if kname in ('gauss', 'linear', 'poly'):
        feats = RealFeatures(numpy.array(examples))

    elif kname in ('wd', 'localalign', 'localimprove'):
        feats = _sequence_char_features(examples, seq_source, nuc_con)

    elif kname in ('spec', 'cumspec'):
        char_feats = _sequence_char_features(examples, seq_source, nuc_con)
        degree = kparam['degree']

        wf = StringUlongFeatures(char_feats.get_alphabet())
        # The last argument requests reversed words for the 'cumspec' variant.
        wf.obtain_from_char(char_feats, degree-1, degree, 0, kname == 'cumspec')
        del char_feats

        if train_mode:
            preproc = SortUlongString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        # The result is deliberately not asserted here, unlike the
        # '*spec2' branch below.
        wf.apply_preprocessor()

        feats = wf

    elif kname in ('spec2', 'cumspec2'):
        # spectrum kernel on two sequences
        feats = {'combined': CombinedFeatures()}
        use_reverse = kname == 'cumspec2'
        degree = kparam['degree']

        # Split the pairs into the two sequence columns.
        (ex0, ex1) = zip(*examples)

        for key, column in (('f0', ex0), ('f1', ex1)):
            char_feats = StringCharFeatures(list(column), DNA)
            wf = StringWordFeatures(char_feats.get_alphabet())
            wf.obtain_from_char(char_feats, degree-1, degree, 0, use_reverse)
            del char_feats

            if train_mode:
                preproc = SortWordString()
                preproc.init(wf)
            wf.add_preprocessor(preproc)
            ret = wf.apply_preprocessor()
            assert ret
            feats['combined'].append_feature_obj(wf)
            feats[key] = wf

    else:
        raise ValueError('Unknown kernel %s' % kname)

    return (feats, preproc)