def non_redundant_ulong_features(feats, kmerlen):
    """Map every k-mer id in ``feats`` through ``get_rcmap`` and sort the result.

    Each feature vector is read, every id is converted to a Python int and
    passed to ``get_rcmap`` together with ``kmerlen``, and the vector is
    written back as an unsigned 64-bit numpy array.  According to the
    original documentation this converts reverse-complement k-mer ids so
    that the Shogun features become non-redundant.  A ``SortUlongString``
    preprocessor is then initialised on ``feats``, attached and applied.

    Arguments:
    feats -- StringUlongFeatures (modified in place)
    kmerlen -- integer, length of k-mer

    Return:
    the same ``feats`` object after conversion and sorting
    """
    ulong_dtype = numpy.dtype('u8')

    for vec_idx in xrange(feats.get_num_vectors()):
        mapped_ids = []
        for kmerid in feats.get_feature_vector(vec_idx):
            mapped_ids.append(get_rcmap(int(kmerid), kmerlen))
        feats.set_feature_vector(numpy.array(mapped_ids, ulong_dtype), vec_idx)

    sorter = SortUlongString()
    sorter.init(feats)

    # Two spellings of the preprocessor API are supported: the
    # add_preproc/apply_preproc pair is tried first and, on AttributeError,
    # the add_preprocessor/apply_preprocessor pair is used instead.
    # NOTE(review): presumably these belong to different Shogun releases.
    try:
        feats.add_preproc(sorter)
        feats.apply_preproc()
    except AttributeError:
        feats.add_preprocessor(sorter)
        feats.apply_preprocessor()

    return feats
def kernel_comm_ulong_string_modular(fm_train_dna=traindat, fm_test_dna=testdat,
                                     order=3, gap=0, reverse=False):
    """Compute CommUlongStringKernel matrices for DNA train and test strings.

    Both string sets are converted to StringUlongFeatures over the DNA
    alphabet (``obtain_from_char`` is called with ``order - 1``, ``order``,
    ``gap`` and ``reverse``).  A single SortUlongString preprocessor is
    initialised on the training features and applied to both feature sets.

    Returns the tuple ``(km_train, km_test, kernel)`` where ``km_train`` is
    the train-vs-train kernel matrix and ``km_test`` the train-vs-test one.
    After the call the kernel object is initialised on (train, test).
    """
    from shogun.Kernel import CommUlongStringKernel
    from shogun.Features import StringUlongFeatures, StringCharFeatures, DNA
    from shogun.PreProc import SortUlongString

    def to_ulong_features(dna_strings):
        # Char features -> ulong k-mer features sharing the same alphabet.
        chars = StringCharFeatures(DNA)
        chars.set_features(dna_strings)
        ulong = StringUlongFeatures(chars.get_alphabet())
        ulong.obtain_from_char(chars, order - 1, order, gap, reverse)
        return ulong

    feats_train = to_ulong_features(fm_train_dna)
    sorter = SortUlongString()
    sorter.init(feats_train)
    feats_train.add_preproc(sorter)
    feats_train.apply_preproc()

    # The test features reuse the preprocessor initialised on the train set.
    feats_test = to_ulong_features(fm_test_dna)
    feats_test.add_preproc(sorter)
    feats_test.apply_preproc()

    kernel = CommUlongStringKernel(feats_train, feats_train, False)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def comm_ulong_string():
    """Demo run of CommUlongStringKernel on the module-level DNA data.

    Prints a banner, converts ``fm_train_dna`` and ``fm_test_dna`` to
    StringUlongFeatures (order 3, gap 0, no reverse), applies one
    SortUlongString preprocessor (initialised on the training features) to
    both, and computes the train-vs-train and train-vs-test kernel
    matrices.  Nothing is returned.
    """
    print('CommUlongString')
    from shogun.Kernel import CommUlongStringKernel
    from shogun.Features import StringUlongFeatures, StringCharFeatures, DNA
    from shogun.PreProc import SortUlongString

    order, gap, reverse = 3, 0, False
    use_sign = False

    # Training strings -> ulong features, then sort them.
    train_chars = StringCharFeatures(DNA)
    train_chars.set_features(fm_train_dna)
    feats_train = StringUlongFeatures(train_chars.get_alphabet())
    feats_train.obtain_from_char(train_chars, order - 1, order, gap, reverse)

    sorter = SortUlongString()
    sorter.init(feats_train)
    feats_train.add_preproc(sorter)
    feats_train.apply_preproc()

    # Test strings go through the same conversion and the same preprocessor.
    test_chars = StringCharFeatures(DNA)
    test_chars.set_features(fm_test_dna)
    feats_test = StringUlongFeatures(test_chars.get_alphabet())
    feats_test.obtain_from_char(test_chars, order - 1, order, gap, reverse)
    feats_test.add_preproc(sorter)
    feats_test.apply_preproc()

    kernel = CommUlongStringKernel(feats_train, feats_train, use_sign)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
def get_kernel_matrix(li, order=6, gap=2, reverse=False, use_sign=False):
    """
    Get kernel matrix from a list of strings.

    The strings are loaded as StringCharFeatures over the RAWBYTE alphabet,
    converted to StringUlongFeatures, sorted with a SortUlongString
    preprocessor and fed to a CommUlongStringKernel (train vs. train).

    Arguments:
    li -- list of strings
    order -- passed to ``obtain_from_char`` (together with ``order - 1``);
             defaults to the previously hard-coded 6
    gap -- passed to ``obtain_from_char``; defaults to the previously
           hard-coded 2
    reverse -- passed to ``obtain_from_char``; defaults to False
    use_sign -- third argument of CommUlongStringKernel; defaults to False

    Return:
    the kernel matrix computed between the features and themselves
    """
    charfeat = StringCharFeatures(RAWBYTE)
    charfeat.set_features(li)

    # The ulong features share the alphabet of the char features.
    feats_train = StringUlongFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # CommUlongStringKernel needs sorted features.
    preproc = SortUlongString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    # Compute kernel matrix between train features.
    kernel = CommUlongStringKernel(feats_train, feats_train, use_sign)
    return kernel.get_kernel_matrix()
def preproc_sortulongstring_modular(fm_train_dna=traindna, fm_test_dna=testdna,
                                    order=3, gap=0, reverse=False, use_sign=False):
    """Sort DNA k-mer features with SortUlongString and evaluate a kernel.

    Train and test strings are both converted to StringUlongFeatures first
    (``obtain_from_char`` receives ``order - 1``, ``order``, ``gap`` and
    ``reverse``).  One SortUlongString preprocessor is then initialised on
    the training features and applied to the train and test features in
    that order.

    Returns ``(km_train, km_test, kernel)``: the train-vs-train matrix, the
    train-vs-test matrix and the CommUlongStringKernel object, which is
    left initialised on (train, test).
    """
    from shogun.Kernel import CommUlongStringKernel
    from shogun.Features import StringCharFeatures, StringUlongFeatures, DNA
    from shogun.PreProc import SortUlongString

    converted = []
    for dna_strings in (fm_train_dna, fm_test_dna):
        chars = StringCharFeatures(DNA)
        chars.set_features(dna_strings)
        ulong = StringUlongFeatures(chars.get_alphabet())
        ulong.obtain_from_char(chars, order - 1, order, gap, reverse)
        converted.append(ulong)
    feats_train, feats_test = converted

    sorter = SortUlongString()
    sorter.init(feats_train)
    # Same preprocessor instance for both sets; train is processed first.
    for feats in (feats_train, feats_test):
        feats.add_preproc(sorter)
        feats.apply_preproc()

    kernel = CommUlongStringKernel(feats_train, feats_train, use_sign)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def get_kernel_matrix(li, order=6, gap=2, reverse=False, use_sign=False):
    """
    Get kernel matrix from a list of strings.

    The strings are loaded as StringCharFeatures over the RAWBYTE alphabet,
    converted to StringUlongFeatures, sorted with a SortUlongString
    preprocessor and fed to a CommUlongStringKernel (train vs. train).

    Arguments:
    li -- list of strings
    order -- passed to ``obtain_from_char`` (together with ``order - 1``);
             defaults to the previously hard-coded 6
    gap -- passed to ``obtain_from_char``; defaults to the previously
           hard-coded 2
    reverse -- passed to ``obtain_from_char``; defaults to False
    use_sign -- third argument of CommUlongStringKernel; defaults to False

    Return:
    the kernel matrix computed between the features and themselves
    """
    charfeat = StringCharFeatures(RAWBYTE)
    charfeat.set_features(li)

    # The ulong features share the alphabet of the char features.
    feats_train = StringUlongFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # CommUlongStringKernel needs sorted features.
    preproc = SortUlongString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    # Compute kernel matrix between train features.
    kernel = CommUlongStringKernel(feats_train, feats_train, use_sign)
    return kernel.get_kernel_matrix()
def _string_char_features(examples, seq_source, nuc_con):
    """Build StringCharFeatures for DNA or protein sequences.

    The sequences are first passed, together with ``nuc_con``, through
    ``non_atcg_convert`` (DNA) or ``non_aminoacid_converter`` (protein).
    Any other ``seq_source`` writes a message to stderr and exits the
    process with status -1.
    """
    if seq_source == 'dna':
        return StringCharFeatures(non_atcg_convert(examples, nuc_con), DNA)
    if seq_source == 'protein':
        return StringCharFeatures(non_aminoacid_converter(examples, nuc_con), PROTEIN)
    sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
    sys.exit(-1)


def _sorted_word_features(sequences, degree, use_reverse, train_mode, preproc):
    """Build preprocessed StringWordFeatures for one side of a sequence pair.

    Returns ``(features, preproc)``.  In train mode ``preproc`` is a new
    SortWordString initialised on the features; otherwise it is the object
    that was passed in.
    """
    charfeat = StringCharFeatures(list(sequences), DNA)
    wf = StringWordFeatures(charfeat.get_alphabet())
    wf.obtain_from_char(charfeat, degree - 1, degree, 0, use_reverse)
    del charfeat

    if train_mode:
        preproc = SortWordString()
        preproc.init(wf)
    wf.add_preproc(preproc)
    ret = wf.apply_preproc()
    assert ret
    return wf, preproc


def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features

    Arguments:
    kname -- kernel name; selects the feature type:
             'gauss', 'linear', 'poly'        -> RealFeatures
             'wd', 'localalign', 'localimprove' -> StringCharFeatures
             'spec', 'cumspec'                -> StringUlongFeatures with the
                                                 preprocessor applied
             'spec2', 'cumspec2'              -> dict with keys 'combined'
                                                 (CombinedFeatures), 'f0', 'f1'
                                                 (StringWordFeatures)
    examples -- the input data; for 'spec2'/'cumspec2' an iterable of
                pairs, which is unzipped into the two sequence lists
    kparam -- kernel parameters; only kparam['degree'] is read, and only by
              the spectrum kernels
    train_mode -- when true, the spectrum kernels create and initialise a
                  new sorting preprocessor; otherwise the ``preproc``
                  argument is attached as-is
                  (NOTE(review): presumably the object returned by an
                  earlier train-mode call -- confirm against callers)
    preproc -- preprocessor to reuse when ``train_mode`` is false
    seq_source -- 'dna' or 'protein'; used by the 'wd'-family and
                  'spec'/'cumspec' kernels.  Any other value makes those
                  branches exit the process with status -1.
    nuc_con -- forwarded to non_atcg_convert / non_aminoacid_converter

    Return:
    ``(feats, preproc)``.  For 'spec2'/'cumspec2' in train mode ``preproc``
    is the one created for the second sequence of each pair.

    Raises:
    ValueError -- if ``kname`` is not one of the names listed above.
    """
    if kname in ('gauss', 'linear', 'poly'):
        feats = RealFeatures(numpy.array(examples))

    elif kname in ('wd', 'localalign', 'localimprove'):
        feats = _string_char_features(examples, seq_source, nuc_con)

    elif kname in ('spec', 'cumspec'):
        charfeat = _string_char_features(examples, seq_source, nuc_con)
        degree = kparam['degree']

        wf = StringUlongFeatures(charfeat.get_alphabet())
        wf.obtain_from_char(charfeat, degree - 1, degree, 0, kname == 'cumspec')
        del charfeat

        if train_mode:
            preproc = SortUlongString()
            preproc.init(wf)
        wf.add_preproc(preproc)
        # Unlike the two-sequence kernels, the result of apply_preproc is
        # not checked here (the check was disabled in the original code).
        wf.apply_preproc()
        feats = wf

    elif kname in ('spec2', 'cumspec2'):
        # spectrum kernel on two sequences
        feats = {'combined': CombinedFeatures()}
        use_reverse = kname == 'cumspec2'
        (ex0, ex1) = zip(*examples)
        degree = kparam['degree']

        # Both halves go through the same add_preproc/apply_preproc calls;
        # previously the first half used differently named methods.
        for key, sequences in (('f0', ex0), ('f1', ex1)):
            wf, preproc = _sorted_word_features(
                sequences, degree, use_reverse, train_mode, preproc)
            feats['combined'].append_feature_obj(wf)
            feats[key] = wf

    else:
        # Previously this only printed the message and then failed with an
        # UnboundLocalError on the return statement below.
        raise ValueError('Unknown kernel %s' % kname)

    return (feats, preproc)
def normal_ulong_feature(feats):
    """Attach a SortUlongString preprocessor to ``feats`` and apply it.

    The preprocessor is initialised on ``feats`` before being attached.
    The add_preprocessor/apply_preprocessor methods are used when the
    features object provides them; otherwise the function falls back to
    add_preproc/apply_preproc, the other spelling that
    non_redundant_ulong_features also supports.

    Arguments:
    feats -- ulong string features object (modified in place)

    Return:
    the same ``feats`` object
    """
    preproc = SortUlongString()
    preproc.init(feats)

    # Only the attribute lookup is guarded, so an AttributeError raised
    # while actually running the preprocessor is not mistaken for a
    # missing method.
    try:
        attach, run = feats.add_preprocessor, feats.apply_preprocessor
    except AttributeError:
        attach, run = feats.add_preproc, feats.apply_preproc

    attach(preproc)
    run()
    return feats