def distance_manhattenword(train_fname=traindna, test_fname=testdna, order=3, gap=0, reverse=False): from shogun import StringCharFeatures, StringWordFeatures, DNA from shogun import SortWordString, ManhattanWordDistance, CSVFile charfeat = StringCharFeatures(CSVFile(train_fname), DNA) feats_train = StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat = StringCharFeatures(CSVFile(test_fname), DNA) feats_test = StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance = ManhattanWordDistance(feats_train, feats_train) dm_train = distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test = distance.get_distance_matrix() return dm_train, dm_test
def distance_canberraword (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False): from shogun import StringCharFeatures, StringWordFeatures, DNA from shogun import SortWordString from shogun import CanberraWordDistance charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance=CanberraWordDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test
def preprocessor_sortwordstring(fm_train_dna=traindna, fm_test_dna=testdna, order=3, gap=0, reverse=False, use_sign=False): from shogun import CommWordStringKernel from shogun import StringCharFeatures, StringWordFeatures, DNA from shogun import SortWordString charfeat = StringCharFeatures(fm_train_dna, DNA) feats_train = StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat = StringCharFeatures(fm_test_dna, DNA) feats_test = StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel = CommWordStringKernel(feats_train, feats_train, use_sign) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def init_sensor(self, kernel, svs): f = StringCharFeatures(svs, DNA) kname = kernel['name'] if kname == 'spectrum': wf = StringWordFeatures(f.get_alphabet()) wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0, False) pre = SortWordString() pre.init(wf) wf.add_preprocessor(pre) wf.apply_preprocessor() f = wf k = CommWordStringKernel(0, False) k.set_use_dict_diagonal_optimization(kernel['order'] < 8) self.preproc = pre elif kname == 'wdshift': k = WeightedDegreePositionStringKernel(0, kernel['order']) k.set_normalizer(IdentityKernelNormalizer()) k.set_shifts(kernel['shift'] * numpy.ones(f.get_max_vector_length(), dtype=numpy.int32)) k.set_position_weights(1.0 / f.get_max_vector_length() * numpy.ones(f.get_max_vector_length(), dtype=numpy.float64)) else: raise "Currently, only wdshift and spectrum kernels supported" self.kernel = k self.train_features = f return (self.kernel, self.train_features)
def kernel_weighted_comm_word_string (fm_train_dna=traindat,fm_test_dna=testdat,order=3,gap=0,reverse=True ): from shogun import WeightedCommWordStringKernel from shogun import StringWordFeatures, StringCharFeatures, DNA from shogun import SortWordString charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() use_sign=False kernel=WeightedCommWordStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def distance_hammingword (fm_train_dna=traindna,fm_test_dna=testdna, fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False): from shogun import StringCharFeatures, StringWordFeatures, DNA from shogun import SortWordString from shogun import HammingWordDistance charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance=HammingWordDistance(feats_train, feats_train, use_sign) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test
def get_spectrum_features(data, order=3, gap=0, reverse=True): """ create feature object used by spectrum kernel """ charfeat = StringCharFeatures(data, DNA) feat = StringWordFeatures(charfeat.get_alphabet()) feat.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feat) feat.add_preprocessor(preproc) feat.apply_preprocessor() return feat
def runShogunSVMDNASpectrumKernel(train_xt, train_lt, test_xt): """ run svm with spectrum kernel """ ################################################## # set up SVM charfeat_train = StringCharFeatures(train_xt, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat_train, K-1, K, GAP, False) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat_test = StringCharFeatures(test_xt, DNA) feats_test=StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat_test, K-1, K, GAP, False) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel=CommWordStringKernel(feats_train, feats_train, False) kernel.io.set_loglevel(MSG_DEBUG) # init kernel labels = BinaryLabels(train_lt) # run svm model print "Ready to train!" svm=LibSVM(SVMC, kernel, labels) svm.io.set_loglevel(MSG_DEBUG) svm.train() # predictions print "Making predictions!" out1DecisionValues = svm.apply(feats_train) out1=out1DecisionValues.get_labels() kernel.init(feats_train, feats_test) out2DecisionValues = svm.apply(feats_test) out2=out2DecisionValues.get_labels() return out1,out2,out1DecisionValues,out2DecisionValues
def distance_manhattenword (train_fname=traindna,test_fname=testdna,order=3,gap=0,reverse=False): from shogun import StringCharFeatures, StringWordFeatures, DNA from shogun import SortWordString, ManhattanWordDistance, CSVFile charfeat=StringCharFeatures(CSVFile(train_fname), DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(CSVFile(test_fname), DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance=ManhattanWordDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return dm_train,dm_test
def tests_check_commwordkernel_memleak (num, order, gap, reverse): import gc from shogun import Alphabet,StringCharFeatures,StringWordFeatures,DNA from shogun import SortWordString, MSG_DEBUG from shogun import CommWordStringKernel, IdentityKernelNormalizer from numpy import mat POS=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT'] NEG=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT'] for i in range(10): alpha=Alphabet(DNA) traindat=StringCharFeatures(alpha) traindat.set_features(POS+NEG) trainudat=StringWordFeatures(traindat.get_alphabet()); trainudat.obtain_from_char(traindat, order-1, order, gap, reverse) #trainudat.io.set_loglevel(MSG_DEBUG) pre = SortWordString() #pre.io.set_loglevel(MSG_DEBUG) pre.init(trainudat) trainudat.add_preprocessor(pre) trainudat.apply_preprocessor() spec = CommWordStringKernel(10, False) spec.set_normalizer(IdentityKernelNormalizer()) spec.init(trainudat, trainudat) K=spec.get_kernel_matrix() del POS del NEG del order del gap del reverse return K
def tests_check_commwordkernel_memleak(num, order, gap, reverse): import gc from shogun import Alphabet, StringCharFeatures, StringWordFeatures, DNA from shogun import SortWordString, MSG_DEBUG from shogun import CommWordStringKernel, IdentityKernelNormalizer from numpy import mat POS = [ num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT' ] NEG = [ num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT' ] for i in range(10): alpha = Alphabet(DNA) traindat = StringCharFeatures(alpha) traindat.set_features(POS + NEG) trainudat = StringWordFeatures(traindat.get_alphabet()) trainudat.obtain_from_char(traindat, order - 1, order, gap, reverse) #trainudat.io.set_loglevel(MSG_DEBUG) pre = SortWordString() #pre.io.set_loglevel(MSG_DEBUG) pre.init(trainudat) trainudat.add_preprocessor(pre) trainudat.apply_preprocessor() spec = CommWordStringKernel(10, False) spec.set_normalizer(IdentityKernelNormalizer()) spec.init(trainudat, trainudat) K = spec.get_kernel_matrix() del POS del NEG del order del gap del reverse return K