def manhattan_word_distance():
    """Evaluate ManhattanWordDistance on the module-level DNA example data.

    Reads the globals ``fm_train_dna`` and ``fm_test_dna``, turns each into
    word features (with a SortWordString preprocessor initialised on the
    training features and applied to both sets), then computes the
    train/train and train/test distance matrices.

    The matrices are computed but not returned; the function returns None.
    """
    print('ManhattanWordDistance')

    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString
    from shogun.Distance import ManhattanWordDistance

    order = 3
    gap = 0
    reverse = False

    def build_word_features(dna_strings):
        # Character features come first; the word features are derived
        # from them via obtain_from_char.
        chars = StringCharFeatures(DNA)
        chars.set_features(dna_strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = build_word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    # The test set reuses the preprocessor initialised on the training set.
    test_words = build_word_features(fm_test_dna)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    metric = ManhattanWordDistance(train_words, train_words)
    dm_train = metric.get_distance_matrix()

    metric.init(train_words, test_words)
    dm_test = metric.get_distance_matrix()
def kernel_comm_word_string_modular(fm_train_dna=traindat, fm_test_dna=testdat,
                                    order=3, gap=0, reverse=False,
                                    use_sign=False):
    """Compute CommWordStringKernel matrices for DNA training/test strings.

    Args:
        fm_train_dna: training strings handed to ``StringCharFeatures``.
        fm_test_dna: test strings handed to ``StringCharFeatures``.
        order, gap, reverse: forwarded to
            ``StringWordFeatures.obtain_from_char`` (called with
            ``order - 1, order, gap, reverse``).
        use_sign: forwarded to the ``CommWordStringKernel`` constructor.

    Returns:
        Tuple ``(km_train, km_test, kernel)``: the train/train kernel matrix,
        the train/test kernel matrix, and the kernel object (left
        initialised on train/test).
    """
    from shogun.Kernel import CommWordStringKernel
    from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
    from shogun.Preprocessor import SortWordString

    def build_word_features(dna_strings):
        # Word features are derived from freshly built character features.
        chars = StringCharFeatures(DNA)
        chars.set_features(dna_strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = build_word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    # Same preprocessor instance (initialised on train) is used for test.
    test_words = build_word_features(fm_test_dna)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    kernel = CommWordStringKernel(train_words, train_words, use_sign)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train_words, test_words)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def sort_word_string():
    """Run the SortWordString preprocessing example on the module-level data.

    Reads the globals ``fm_train_dna`` and ``fm_test_dna``, builds word
    features for both, applies a SortWordString preprocessor (initialised on
    the training features) to each, and evaluates a CommWordStringKernel on
    train/train and train/test.

    The kernel matrices are computed but not returned.
    """
    print('CommWordString')

    from shogun.Kernel import CommWordStringKernel
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString

    order = 3
    gap = 0
    reverse = False
    start = order - 1

    # Training features.
    train_chars = StringCharFeatures(fm_train_dna, DNA)
    train_words = StringWordFeatures(train_chars.get_alphabet())
    train_words.obtain_from_char(train_chars, start, order, gap, reverse)

    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    # Test features, processed with the preprocessor initialised above.
    test_chars = StringCharFeatures(fm_test_dna, DNA)
    test_words = StringWordFeatures(test_chars.get_alphabet())
    test_words.obtain_from_char(test_chars, start, order, gap, reverse)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    use_sign = False
    kernel = CommWordStringKernel(train_words, train_words, use_sign)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train_words, test_words)
    km_test = kernel.get_kernel_matrix()
def preproc_sortwordstring_modular(fm_train_dna=traindna, fm_test_dna=testdna,
                                   order=3, gap=0, reverse=False,
                                   use_sign=False):
    """Apply SortWordString to DNA word features and evaluate a kernel on them.

    Args:
        fm_train_dna: training strings for ``StringCharFeatures``.
        fm_test_dna: test strings for ``StringCharFeatures``.
        order, gap, reverse: forwarded to
            ``StringWordFeatures.obtain_from_char`` (called with
            ``order - 1, order, gap, reverse``).
        use_sign: forwarded to the ``CommWordStringKernel`` constructor.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun.Kernel import CommWordStringKernel
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.Preprocessor import SortWordString

    start = order - 1

    # Training side: build word features, then initialise and apply the
    # preprocessor on them.
    train_chars = StringCharFeatures(fm_train_dna, DNA)
    train_words = StringWordFeatures(train_chars.get_alphabet())
    train_words.obtain_from_char(train_chars, start, order, gap, reverse)

    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    # Test side: same construction, reusing the already-initialised sorter.
    test_chars = StringCharFeatures(fm_test_dna, DNA)
    test_words = StringWordFeatures(test_chars.get_alphabet())
    test_words.obtain_from_char(test_chars, start, order, gap, reverse)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    kernel = CommWordStringKernel(train_words, train_words, use_sign)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train_words, test_words)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def distance_canberraword_modular(fm_train_dna=traindna, fm_test_dna=testdna,
                                  order=3, gap=0, reverse=False):
    """Compute CanberraWordDistance matrices for DNA training/test strings.

    Args:
        fm_train_dna: training strings set on a DNA ``StringCharFeatures``.
        fm_test_dna: test strings set on a DNA ``StringCharFeatures``.
        order, gap, reverse: forwarded to
            ``StringWordFeatures.obtain_from_char`` (called with
            ``order - 1, order, gap, reverse``).

    Returns:
        Tuple ``(distance, dm_train, dm_test)``: the distance object (left
        initialised on train/test), the train/train matrix and the
        train/test matrix.
    """
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.Preprocessor import SortWordString
    from shogun.Distance import CanberraWordDistance

    def build_word_features(dna_strings):
        # Character features first, then word features derived from them.
        chars = StringCharFeatures(DNA)
        chars.set_features(dna_strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = build_word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    # The preprocessor initialised on the training set is reused here.
    test_words = build_word_features(fm_test_dna)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    distance = CanberraWordDistance(train_words, train_words)
    dm_train = distance.get_distance_matrix()

    distance.init(train_words, test_words)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test
def create_hashed_features_spectrum(param, data):
    """Build ImplicitWeightedSpecFeatures (spectrum kernel) from DNA strings.

    Args:
        param: mapping; only ``param["degree_spectrum"]`` is read and used
            as the word order.
        data: strings passed to ``StringCharFeatures(data, DNA)``.

    Returns:
        ``ImplicitWeightedSpecFeatures`` built on the SortWordString-
        preprocessed word features, with normalization enabled.
    """
    degree = param["degree_spectrum"]

    # Settings that are fixed for this feature construction.
    no_gap = 0
    use_reverse = True
    do_normalize = True

    # Character features -> word features of the requested degree.
    char_feats = StringCharFeatures(data, DNA)
    word_feats = StringWordFeatures(char_feats.get_alphabet())
    word_feats.obtain_from_char(char_feats, degree - 1, degree, no_gap,
                                use_reverse)

    # Attach and apply the sorting preprocessor.
    sorter = SortWordString()
    sorter.init(word_feats)
    word_feats.add_preproc(sorter)
    word_feats.apply_preproc()

    return ImplicitWeightedSpecFeatures(word_feats, do_normalize)
def init_sensor(self, kernel, svs):
    """Create the kernel object and training features for this sensor.

    Args:
        kernel: mapping describing the kernel. ``kernel['name']`` must be
            ``'spectrum'`` or ``'wdshift'``; ``kernel['order']`` is read for
            both, and ``kernel['shift']`` additionally for ``'wdshift'``.
        svs: strings passed to ``StringCharFeatures(svs, DNA)``.

    Returns:
        Tuple ``(self.kernel, self.train_features)``.

    Side effects:
        Sets ``self.kernel`` and ``self.train_features``; for the spectrum
        kernel also sets ``self.preproc`` to the SortWordString instance.

    Raises:
        ValueError: if ``kernel['name']`` is not a supported kernel name.
    """
    kname = kernel['name']

    # Reject unknown kernels up front, before any features are built.
    # The previous ``raise "<string>"`` form is itself a TypeError on
    # Python >= 2.6, which hid the intended message from the caller.
    if kname not in ('spectrum', 'wdshift'):
        raise ValueError(
            "Currently, only wdshift and spectrum kernels supported"
            " (got %r)" % (kname,))

    f = StringCharFeatures(svs, DNA)

    if kname == 'spectrum':
        order = kernel['order']
        wf = StringWordFeatures(f.get_alphabet())
        wf.obtain_from_char(f, order - 1, order, 0, False)

        pre = SortWordString()
        pre.init(wf)
        wf.add_preproc(pre)
        wf.apply_preproc()
        # From here on the word features are the training features.
        f = wf

        k = CommWordStringKernel(0, False)
        k.set_use_dict_diagonal_optimization(order < 8)
        self.preproc = pre
    else:  # kname == 'wdshift'
        max_len = f.get_max_vector_length()
        k = WeightedDegreePositionStringKernel(0, kernel['order'])
        k.set_normalizer(IdentityKernelNormalizer())
        # One shift value and one uniform weight (1 / max_len) per position.
        k.set_shifts(kernel['shift'] * numpy.ones(max_len, dtype=numpy.int32))
        k.set_position_weights(
            1.0 / max_len * numpy.ones(max_len, dtype=numpy.float64))

    self.kernel = k
    self.train_features = f
    return (self.kernel, self.train_features)
def distance_hammingword_modular(fm_train_dna=traindna, fm_test_dna=testdna,
                                 fm_test_real=testdat, order=3, gap=0,
                                 reverse=False, use_sign=False):
    """Compute HammingWordDistance matrices for DNA training/test strings.

    Args:
        fm_train_dna: training strings set on a DNA ``StringCharFeatures``.
        fm_test_dna: test strings set on a DNA ``StringCharFeatures``.
        fm_test_real: accepted but not used by this function.
        order, gap, reverse: forwarded to
            ``StringWordFeatures.obtain_from_char`` (called with
            ``order - 1, order, gap, reverse``).
        use_sign: forwarded to the ``HammingWordDistance`` constructor.

    Returns:
        Tuple ``(distance, dm_train, dm_test)``.
    """
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString
    from shogun.Distance import HammingWordDistance

    def build_word_features(dna_strings):
        # Character features first, then word features derived from them.
        chars = StringCharFeatures(DNA)
        chars.set_features(dna_strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = build_word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    # The preprocessor initialised on the training set is reused here.
    test_words = build_word_features(fm_test_dna)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    distance = HammingWordDistance(train_words, train_words, use_sign)
    dm_train = distance.get_distance_matrix()

    distance.init(train_words, test_words)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test
def create_hashed_features_spectrum(param, data):
    """Return spectrum-kernel ImplicitWeightedSpecFeatures for DNA strings.

    Args:
        param: mapping; only ``param["degree_spectrum"]`` is read and used
            as the word order.
        data: strings passed to ``StringCharFeatures(data, DNA)``.

    Returns:
        ``ImplicitWeightedSpecFeatures`` wrapping the preprocessed word
        features, constructed with normalization switched on.
    """
    spectrum_degree = param["degree_spectrum"]

    # Hard-wired settings: no gap, reversed words, normalized output.
    gap_width = 0
    reversed_words = True
    normalized = True

    # Derive word features of the requested degree from the raw characters.
    chars = StringCharFeatures(data, DNA)
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, spectrum_degree - 1, spectrum_degree,
                           gap_width, reversed_words)

    # Sort the words via the SortWordString preprocessor.
    word_sorter = SortWordString()
    word_sorter.init(words)
    words.add_preproc(word_sorter)
    words.apply_preproc()

    hashed = ImplicitWeightedSpecFeatures(words, normalized)
    return hashed
def init_sensor(self, kernel, svs):
    """Create the kernel object and training features for this sensor.

    Args:
        kernel: mapping describing the kernel. ``kernel['name']`` must be
            ``'spectrum'`` or ``'wdshift'``; ``kernel['order']`` is read for
            both, and ``kernel['shift']`` additionally for ``'wdshift'``.
        svs: strings passed to ``StringCharFeatures(svs, DNA)``.

    Returns:
        Tuple ``(self.kernel, self.train_features)``.

    Side effects:
        Sets ``self.kernel`` and ``self.train_features``; for the spectrum
        kernel also sets ``self.preproc`` to the SortWordString instance.

    Raises:
        ValueError: if ``kernel['name']`` is not a supported kernel name.
    """
    kname = kernel['name']

    # Reject unknown kernels up front, before any features are built.
    # The previous ``raise "<string>"`` form is itself a TypeError on
    # Python >= 2.6, which hid the intended message from the caller.
    if kname not in ('spectrum', 'wdshift'):
        raise ValueError(
            "Currently, only wdshift and spectrum kernels supported"
            " (got %r)" % (kname,))

    f = StringCharFeatures(svs, DNA)

    if kname == 'spectrum':
        order = kernel['order']
        wf = StringWordFeatures(f.get_alphabet())
        wf.obtain_from_char(f, order - 1, order, 0, False)

        pre = SortWordString()
        pre.init(wf)
        wf.add_preproc(pre)
        wf.apply_preproc()
        # From here on the word features are the training features.
        f = wf

        k = CommWordStringKernel(0, False)
        k.set_use_dict_diagonal_optimization(order < 8)
        self.preproc = pre
    else:  # kname == 'wdshift'
        max_len = f.get_max_vector_length()
        k = WeightedDegreePositionStringKernel(0, kernel['order'])
        k.set_normalizer(IdentityKernelNormalizer())
        # One shift value and one uniform weight (1 / max_len) per position.
        k.set_shifts(kernel['shift'] * numpy.ones(max_len, dtype=numpy.int32))
        k.set_position_weights(
            1.0 / max_len * numpy.ones(max_len, dtype=numpy.float64))

    self.kernel = k
    self.train_features = f
    return (self.kernel, self.train_features)
def perform_clustering(mss_id):
    """Hierarchically cluster the sequences of a multi-split set.

    Looks up the ``MultiSplitSet`` for ``mss_id``, fetches one sequence per
    split set (keyed by ``ss.dataset.organism``) through ``SequencesHandler``,
    converts them to PROTEIN word features of order 1, and trains a
    ``Hierarchical`` clustering with 4 merges on a ``HammingWordDistance``.

    Args:
        mss_id: identifier passed to ``expenv.MultiSplitSet.get``.

    Returns:
        The trained ``Hierarchical`` object.
    """
    import numpy
    import expenv

    mss = expenv.MultiSplitSet.get(mss_id)

    from method_mhc_mkl import SequencesHandler
    from shogun.Distance import EuclidianDistance, HammingWordDistance
    from shogun.Features import StringCharFeatures, StringWordFeatures, PROTEIN
    from shogun.Clustering import Hierarchical
    from shogun.PreProc import SortWordString

    word_order = 1
    word_gap = 0
    reversed_words = False

    # One sequence per split set, looked up by organism.
    handler = SequencesHandler()
    sequences = []
    for split_set in mss.split_sets:
        sequences.append(handler.get_seq(split_set.dataset.organism))

    protein_chars = StringCharFeatures(PROTEIN)
    protein_chars.set_features(sequences)
    words = StringWordFeatures(protein_chars.get_alphabet())
    words.obtain_from_char(protein_chars, word_order - 1, word_order,
                           word_gap, reversed_words)

    sorter = SortWordString()
    sorter.init(words)
    words.add_preproc(sorter)
    words.apply_preproc()

    use_sign = False
    word_distance = HammingWordDistance(words, words, use_sign)

    num_merges = 4
    clustering = Hierarchical(num_merges, word_distance)
    clustering.train()

    # Results of these two getters are not used here; the calls are kept
    # as in the original sequence.
    clustering.get_merge_distances()
    clustering.get_cluster_pairs()

    return clustering
def perform_clustering(mss_id):
    """Train a hierarchical clustering over a multi-split set's sequences.

    The ``MultiSplitSet`` for ``mss_id`` supplies the split sets; for each,
    ``SequencesHandler.get_seq`` is called with ``ss.dataset.organism``. The
    resulting strings become PROTEIN word features (order 1, gap 0, not
    reversed) with a SortWordString preprocessor applied, and a
    ``Hierarchical`` clustering (4 merges) is trained on their
    ``HammingWordDistance``.

    Args:
        mss_id: identifier passed to ``expenv.MultiSplitSet.get``.

    Returns:
        The trained ``Hierarchical`` object.
    """
    import numpy
    import expenv

    mss = expenv.MultiSplitSet.get(mss_id)

    from method_mhc_mkl import SequencesHandler
    from shogun.Distance import EuclidianDistance, HammingWordDistance
    from shogun.Features import StringCharFeatures, StringWordFeatures, PROTEIN
    from shogun.Clustering import Hierarchical
    from shogun.PreProc import SortWordString

    order, gap, reverse = 1, 0, False

    seq_source = SequencesHandler()
    organism_seqs = [
        seq_source.get_seq(split.dataset.organism)
        for split in mss.split_sets
    ]

    # Raw protein characters -> word features.
    char_feats = StringCharFeatures(PROTEIN)
    char_feats.set_features(organism_seqs)
    word_feats = StringWordFeatures(char_feats.get_alphabet())
    word_feats.obtain_from_char(char_feats, order - 1, order, gap, reverse)

    word_sorter = SortWordString()
    word_sorter.init(word_feats)
    word_feats.add_preproc(word_sorter)
    word_feats.apply_preproc()

    hamming = HammingWordDistance(word_feats, word_feats, False)

    tree = Hierarchical(4, hamming)
    tree.train()

    # Getter results are discarded; the calls mirror the original sequence.
    tree.get_merge_distances()
    tree.get_cluster_pairs()

    return tree
def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse):
    """Rebuild a CommWordStringKernel ten times and return the last matrix.

    Each iteration constructs DNA character features from the example
    strings, derives word features, applies a SortWordString preprocessor,
    and initialises a ``CommWordStringKernel`` (with an identity normalizer)
    on the word features against themselves.

    Args:
        num: repetition factor for the 4-letter motifs ('ACGT' / 'TTGT')
            that make up every example string.
        order, gap, reverse: forwarded to
            ``StringWordFeatures.obtain_from_char`` (called with
            ``order - 1, order, gap, reverse``).

    Returns:
        The kernel matrix from the final iteration.
    """
    import gc
    from shogun.Features import Alphabet, StringCharFeatures, StringWordFeatures, DNA
    from shogun.Preprocessor import SortWordString, MSG_DEBUG
    from shogun.Kernel import CommWordStringKernel, IdentityKernelNormalizer
    from numpy import mat

    # Both example lists have the same layout: 60 'ACGT' strings, then
    # 21 'TTGT' strings, then 69 'ACGT' strings (150 entries each).
    # NOTE(review): counts were transcribed from the original explicit
    # literal -- confirm against version control if exact sizes matter.
    positives = ([num * 'ACGT'] * 60
                 + [num * 'TTGT'] * 21
                 + [num * 'ACGT'] * 69)
    negatives = ([num * 'ACGT'] * 60
                 + [num * 'TTGT'] * 21
                 + [num * 'ACGT'] * 69)

    for _ in xrange(10):
        alphabet = Alphabet(DNA)
        char_feats = StringCharFeatures(alphabet)
        char_feats.set_features(positives + negatives)

        word_feats = StringWordFeatures(char_feats.get_alphabet())
        word_feats.obtain_from_char(char_feats, order - 1, order, gap, reverse)

        # Debug logging (io.set_loglevel(MSG_DEBUG)) on the features and the
        # preprocessor is left disabled.
        sorter = SortWordString()
        sorter.init(word_feats)
        word_feats.add_preproc(sorter)
        word_feats.apply_preproc()

        spectrum = CommWordStringKernel(10, False)
        spectrum.set_normalizer(IdentityKernelNormalizer())
        spectrum.init(word_feats, word_feats)
        K = spectrum.get_kernel_matrix()

    # Explicitly drop the large inputs and the parameters before returning.
    del positives, negatives, order, gap, reverse

    return K
def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse):
    """Rebuild a CommWordStringKernel ten times and return the last matrix.

    Every pass creates DNA character features from the example strings,
    converts them to word features, runs a SortWordString preprocessor over
    them, and initialises a ``CommWordStringKernel`` (identity normalizer)
    on the word features against themselves.

    Args:
        num: repetition factor for the 4-letter motifs ('ACGT' / 'TTGT')
            that make up every example string.
        order, gap, reverse: forwarded to
            ``StringWordFeatures.obtain_from_char`` (called with
            ``order - 1, order, gap, reverse``).

    Returns:
        The kernel matrix from the final pass.
    """
    import gc
    from shogun.Features import Alphabet, StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString, MSG_DEBUG
    from shogun.Kernel import CommWordStringKernel, IdentityKernelNormalizer
    from numpy import mat

    # Both example lists have the same layout: 60 'ACGT' strings, then
    # 21 'TTGT' strings, then 69 'ACGT' strings (150 entries each).
    # NOTE(review): counts were transcribed from the original explicit
    # literal -- confirm against version control if exact sizes matter.
    pos_strings = ([num * 'ACGT'] * 60
                   + [num * 'TTGT'] * 21
                   + [num * 'ACGT'] * 69)
    neg_strings = ([num * 'ACGT'] * 60
                   + [num * 'TTGT'] * 21
                   + [num * 'ACGT'] * 69)

    for _ in xrange(10):
        dna_alphabet = Alphabet(DNA)
        chars = StringCharFeatures(dna_alphabet)
        chars.set_features(pos_strings + neg_strings)

        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)

        # Debug logging (io.set_loglevel(MSG_DEBUG)) on the features and the
        # preprocessor is left disabled.
        word_sorter = SortWordString()
        word_sorter.init(words)
        words.add_preproc(word_sorter)
        words.apply_preproc()

        kernel = CommWordStringKernel(10, False)
        kernel.set_normalizer(IdentityKernelNormalizer())
        kernel.init(words, words)
        K = kernel.get_kernel_matrix()

    # Explicitly drop the large inputs and the parameters before returning.
    del pos_strings, neg_strings, order, gap, reverse

    return K
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT'] order=7 gap=0 reverse=False for i in xrange(10): alpha=Alphabet(DNA) traindat=StringCharFeatures(alpha) traindat.set_features(POS+NEG) trainudat=StringWordFeatures(traindat.get_alphabet()); trainudat.obtain_from_char(traindat, order-1, order, gap, reverse) #trainudat.io.set_loglevel(MSG_DEBUG) pre = SortWordString() #pre.io.set_loglevel(MSG_DEBUG) pre.init(trainudat) trainudat.add_preproc(pre) trainudat.apply_preproc() spec = CommWordStringKernel(10, False) spec.set_normalizer(IdentityKernelNormalizer()) spec.init(trainudat, trainudat) K=mat(spec.get_kernel_matrix()) del POS del NEG del order del gap del reverse