def kernel_salzberg_word_string(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, order=3, gap=0, reverse=False): from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels from shogun import SalzbergWordStringKernel from shogun import PluginEstimate charfeat = StringCharFeatures(fm_train_dna, DNA) feats_train = StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) charfeat = StringCharFeatures(fm_test_dna, DNA) feats_test = StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) pie = PluginEstimate() labels = BinaryLabels(label_train_dna) pie.set_labels(labels) pie.set_features(feats_train) pie.train() kernel = SalzbergWordStringKernel(feats_train, feats_train, pie, labels) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) pie.set_features(feats_test) pie.apply().get_labels() km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def kernel_match_word_string(fm_train_dna=traindat, fm_test_dna=testdat, degree=3, scale=1.4, size_cache=10, order=3, gap=0, reverse=False): from shogun import MatchWordStringKernel, AvgDiagKernelNormalizer from shogun import StringWordFeatures, StringCharFeatures, DNA charfeat = StringCharFeatures(fm_train_dna, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) charfeat = StringCharFeatures(fm_test_dna, DNA) feats_test = StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) kernel = MatchWordStringKernel(size_cache, degree) kernel.set_normalizer(AvgDiagKernelNormalizer(scale)) kernel.init(feats_train, feats_train) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def classifier_ssk(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, C=1, maxlen=1, decay=1): from shogun import StringCharFeatures, BinaryLabels from shogun import LibSVM, SubsequenceStringKernel, DNA from shogun import ErrorRateMeasure feats_train = StringCharFeatures(fm_train_dna, DNA) feats_test = StringCharFeatures(fm_test_dna, DNA) labels = BinaryLabels(label_train_dna) kernel = SubsequenceStringKernel(feats_train, feats_train, maxlen, decay) svm = LibSVM(C, kernel, labels) svm.train() out = svm.apply(feats_train) evaluator = ErrorRateMeasure() trainerr = evaluator.evaluate(out, labels) # print(trainerr) kernel.init(feats_train, feats_test) predicted_labels = svm.apply(feats_test).get_labels() # print predicted_labels return predicted_labels
def distance_hammingword (fm_train_dna=traindna,fm_test_dna=testdna, fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False): from shogun import StringCharFeatures, StringWordFeatures, DNA from shogun import SortWordString from shogun import HammingWordDistance charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance=HammingWordDistance(feats_train, feats_train, use_sign) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test
def classifier_svmlight_linear_term (fm_train_dna=traindna,fm_test_dna=testdna, \ label_train_dna=label_traindna,degree=3, \ C=10,epsilon=1e-5,num_threads=1): from shogun import StringCharFeatures, BinaryLabels, DNA from shogun import WeightedDegreeStringKernel try: from shogun import SVMLight except ImportError: print("SVMLight is not available") exit(0) feats_train=StringCharFeatures(DNA) feats_train.set_features(fm_train_dna) feats_test=StringCharFeatures(DNA) feats_test.set_features(fm_test_dna) kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree) labels=BinaryLabels(label_train_dna) svm=SVMLight(C, kernel, labels) svm.set_qpsize(3) svm.set_linear_term(-numpy.array([1,2,3,4,5,6,7,8,7,6], dtype=numpy.double)); svm.set_epsilon(epsilon) svm.parallel.set_num_threads(num_threads) svm.train() kernel.init(feats_train, feats_test) out = svm.apply().get_labels() return out,kernel
def kernel_weighted_degree_string (fm_train_dna=traindat,fm_test_dna=testdat,degree=20): from shogun import StringCharFeatures, DNA from shogun import WeightedDegreeStringKernel, MSG_DEBUG feats_train=StringCharFeatures(fm_train_dna, DNA) #feats_train.io.set_loglevel(MSG_DEBUG) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree) from numpy import arange,double weights=arange(1,degree+1,dtype=double)[::-1]/ \ sum(arange(1,degree+1,dtype=double)) kernel.set_wd_weights(weights) #from numpy import ones,float64,int32 #kernel.set_position_weights(ones(len(fm_train_dna[0]), dtype=float64)) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() #this is how to serializate the kernel #import pickle #pickle.dump(kernel, file('tmp/kernel_obj.dump','w'), protocol=2) #k=pickle.load(file('tmp/kernel_obj.dump','r')) return km_train, km_test, kernel
def kernel_histogram_word_string(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, order=3, ppseudo_count=1, npseudo_count=1): from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels from shogun import HistogramWordStringKernel, AvgDiagKernelNormalizer from shogun import PluginEstimate #, MSG_DEBUG charfeat = StringCharFeatures(DNA) #charfeat.io.set_loglevel(MSG_DEBUG) charfeat.set_features(fm_train_dna) feats_train = StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order - 1, order, 0, False) charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test = StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order - 1, order, 0, False) pie = PluginEstimate(ppseudo_count, npseudo_count) labels = BinaryLabels(label_train_dna) pie.set_labels(labels) pie.set_features(feats_train) pie.train() kernel = HistogramWordStringKernel(feats_train, feats_train, pie) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) pie.set_features(feats_test) pie.apply().get_labels() km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def kernel_poly_match_word_string(fm_train_dna=traindat, fm_test_dna=testdat, degree=2, inhomogene=True, order=3, gap=0, reverse=False): from shogun import PolyMatchWordStringKernel from shogun import StringWordFeatures, StringCharFeatures, DNA charfeat = StringCharFeatures(fm_train_dna, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) charfeat = StringCharFeatures(fm_test_dna, DNA) feats_test = StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) kernel = PolyMatchWordStringKernel(feats_train, feats_train, degree, inhomogene) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def distance_manhattenword(train_fname=traindna, test_fname=testdna, order=3, gap=0, reverse=False): from shogun import StringCharFeatures, StringWordFeatures, DNA from shogun import SortWordString, ManhattanWordDistance, CSVFile charfeat = StringCharFeatures(CSVFile(train_fname), DNA) feats_train = StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat = StringCharFeatures(CSVFile(test_fname), DNA) feats_test = StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance = ManhattanWordDistance(feats_train, feats_train) dm_train = distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test = distance.get_distance_matrix() return dm_train, dm_test
def preprocessor_sortwordstring(fm_train_dna=traindna, fm_test_dna=testdna, order=3, gap=0, reverse=False, use_sign=False): from shogun import CommWordStringKernel from shogun import StringCharFeatures, StringWordFeatures, DNA from shogun import SortWordString charfeat = StringCharFeatures(fm_train_dna, DNA) feats_train = StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat = StringCharFeatures(fm_test_dna, DNA) feats_test = StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel = CommWordStringKernel(feats_train, feats_train, use_sign) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def classifier_domainadaptationsvm (fm_train_dna=traindna,fm_test_dna=testdna, \ label_train_dna=label_traindna, \ label_test_dna=label_testdna,fm_train_dna2=traindna2,fm_test_dna2=testdna2, \ label_train_dna2=label_traindna2,label_test_dna2=label_testdna2,C=1,degree=3): feats_train = StringCharFeatures(fm_train_dna, DNA) feats_test = StringCharFeatures(fm_test_dna, DNA) kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree) labels = BinaryLabels(label_train_dna) svm = SVMLight(C, kernel, labels) svm.train() #svm.io.set_loglevel(MSG_DEBUG) ##################################### #print("obtaining DA SVM from previously trained SVM") feats_train2 = StringCharFeatures(fm_train_dna, DNA) feats_test2 = StringCharFeatures(fm_test_dna, DNA) kernel2 = WeightedDegreeStringKernel(feats_train, feats_train, degree) labels2 = BinaryLabels(label_train_dna) # we regularize against the previously obtained solution dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0) dasvm.train() out = dasvm.apply_binary(feats_test2) return out #,dasvm TODO
def classifier_svmlight(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, C=1.2, epsilon=1e-5, num_threads=1): from shogun import StringCharFeatures, BinaryLabels, DNA from shogun import WeightedDegreeStringKernel try: from shogun import SVMLight except ImportError: print('No support for SVMLight available.') return feats_train = StringCharFeatures(DNA) feats_train.set_features(fm_train_dna) feats_test = StringCharFeatures(DNA) feats_test.set_features(fm_test_dna) degree = 20 kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree) labels = BinaryLabels(label_train_dna) svm = SVMLight(C, kernel, labels) svm.set_epsilon(epsilon) svm.parallel.set_num_threads(num_threads) svm.train() kernel.init(feats_train, feats_test) svm.apply().get_labels() return kernel
def kernel_combined (fm_train_real=traindat,fm_test_real=testdat,fm_train_dna=traindna,fm_test_dna=testdna ): from shogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel from shogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA kernel=CombinedKernel() feats_train=CombinedFeatures() feats_test=CombinedFeatures() subkfeats_train=RealFeatures(fm_train_real) subkfeats_test=RealFeatures(fm_test_real) subkernel=GaussianKernel(10, 1.1) feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) subkfeats_train=StringCharFeatures(fm_train_dna, DNA) subkfeats_test=StringCharFeatures(fm_test_dna, DNA) degree=3 subkernel=FixedDegreeStringKernel(10, degree) feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) subkfeats_train=StringCharFeatures(fm_train_dna, DNA) subkfeats_test=StringCharFeatures(fm_test_dna, DNA) subkernel=LocalAlignmentStringKernel(10) feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) kernel.init(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def kernel_comm_ulong_string(fm_train_dna=traindat, fm_test_dna=testdat, order=3, gap=0, reverse=False): from shogun import CommUlongStringKernel from shogun import StringUlongFeatures, StringCharFeatures, DNA from shogun import SortUlongString charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train = StringUlongFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortUlongString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test = StringUlongFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() use_sign = False kernel = CommUlongStringKernel(feats_train, feats_train, use_sign) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def kernel_top(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, pseudo=1e-1, order=1, gap=0, reverse=False, kargs=[1, False, True]): from shogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA from shogun import PolyKernel from shogun import HMM, BW_NORMAL N = 1 # toy HMM with 1 state M = 4 # 4 observations -> DNA # train HMM for positive class charfeat = StringCharFeatures(fm_hmm_pos, DNA) hmm_pos_train = StringWordFeatures(charfeat.get_alphabet()) hmm_pos_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) pos = HMM(hmm_pos_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) # train HMM for negative class charfeat = StringCharFeatures(fm_hmm_neg, DNA) hmm_neg_train = StringWordFeatures(charfeat.get_alphabet()) hmm_neg_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) neg = HMM(hmm_neg_train, N, M, pseudo) neg.baum_welch_viterbi_train(BW_NORMAL) # Kernel training data charfeat = StringCharFeatures(fm_train_dna, DNA) wordfeats_train = StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) # Kernel testing data charfeat = StringCharFeatures(fm_test_dna, DNA) wordfeats_test = StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) # get kernel on training data pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train = TOPFeatures(10, pos, neg, False, False) kernel = PolyKernel(feats_train, feats_train, *kargs) km_train = kernel.get_kernel_matrix() # get kernel on testing data pos_clone = HMM(pos) neg_clone = HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False) kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def kernel_fisher (fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, N=1,M=4,pseudo=1e-1,order=1,gap=0,reverse=False, kargs=[1,False,True]): from shogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA from shogun import PolyKernel from shogun import HMM, BW_NORMAL#, MSG_DEBUG # train HMM for positive class charfeat=StringCharFeatures(fm_hmm_pos, DNA) #charfeat.io.set_loglevel(MSG_DEBUG) hmm_pos_train=StringWordFeatures(charfeat.get_alphabet()) hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse) pos=HMM(hmm_pos_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) # train HMM for negative class charfeat=StringCharFeatures(fm_hmm_neg, DNA) hmm_neg_train=StringWordFeatures(charfeat.get_alphabet()) hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse) neg=HMM(hmm_neg_train, N, M, pseudo) neg.baum_welch_viterbi_train(BW_NORMAL) # Kernel training data charfeat=StringCharFeatures(fm_train_dna, DNA) wordfeats_train=StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) # Kernel testing data charfeat=StringCharFeatures(fm_test_dna, DNA) wordfeats_test=StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) # get kernel on training data pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train=FKFeatures(10, pos, neg) feats_train.set_opt_a(-1) #estimate prior kernel=PolyKernel(feats_train, feats_train, *kargs) km_train=kernel.get_kernel_matrix() # get kernel on testing data pos_clone=HMM(pos) neg_clone=HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test=FKFeatures(10, pos_clone, neg_clone) feats_test.set_a(feats_train.get_a()) #use prior from training data kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def get_wd_features(data, feat_type="dna"): """ create feature object for wdk """ if feat_type == "dna": feat = StringCharFeatures(DNA) elif feat_type == "protein": feat = StringCharFeatures(PROTEIN) else: raise Exception("unknown feature type") feat.set_features(data) return feat
def kernel_linear_string(fm_train_dna=traindat, fm_test_dna=testdat): from shogun import StringCharFeatures, DNA from shogun import LinearStringKernel feats_train = StringCharFeatures(fm_train_dna, DNA) feats_test = StringCharFeatures(fm_test_dna, DNA) kernel = LinearStringKernel(feats_train, feats_train) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def predict(self, data): from shogun import SNP, RAWBYTE from shogun import StringCharFeatures if self.isSNP: feats_test = StringCharFeatures(data, SNP) else: feats_test = StringCharFeatures(data, RAWBYTE) # 将测试string数据转化为中间量 self.kernel.init(self.feats_train, feats_test) feats_test = self.kernel.get_kernel_matrix() result = self.clf.predict(feats_test.T) print ' '.join(map(str, result)) return result
def kernel_fixed_degree_string (fm_train_dna=traindat, fm_test_dna=testdat,degree=3): from shogun import StringCharFeatures, DNA from shogun import FixedDegreeStringKernel feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=FixedDegreeStringKernel(feats_train, feats_train, degree) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def distribution_hmm(fm_cube, N, M, pseudo, order, gap, reverse, num_examples): from shogun import StringWordFeatures, StringCharFeatures, CUBE from shogun import HMM, BW_NORMAL charfeat = StringCharFeatures(CUBE) charfeat.set_features(fm_cube) feats = StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order - 1, order, gap, reverse) hmm = HMM(feats, N, M, pseudo) hmm.train() hmm.baum_welch_viterbi_train(BW_NORMAL) num_examples = feats.get_num_vectors() num_param = hmm.get_num_model_parameters() for i in range(num_examples): for j in range(num_param): hmm.get_log_derivative(j, i) best_path = 0 best_path_state = 0 for i in range(num_examples): best_path += hmm.best_path(i) for j in range(N): best_path_state += hmm.get_best_path_state(i, j) lik_example = hmm.get_log_likelihood() lik_sample = hmm.get_log_likelihood_sample() return lik_example, lik_sample, hmm
def distribution_linearhmm(fm_dna=traindna, order=3, gap=0, reverse=False): from shogun import StringWordFeatures, StringCharFeatures, DNA from shogun import LinearHMM charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats = StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order - 1, order, gap, reverse) hmm = LinearHMM(feats) hmm.train() hmm.get_transition_probs() num_examples = feats.get_num_vectors() num_param = hmm.get_num_model_parameters() for i in range(num_examples): for j in range(num_param): hmm.get_log_derivative(j, i) out_likelihood = hmm.get_log_likelihood() out_sample = hmm.get_log_likelihood_sample() return hmm, out_likelihood, out_sample
def features_string_file(directory, fname): from shogun import StringCharFeatures, RAWBYTE from shogun import CSVFile # load features from directory f = StringCharFeatures(RAWBYTE) f.load_from_directory(directory) #and output several stats #print("max string length", f.get_max_vector_length()) #print("number of strings", f.get_num_vectors()) #print("length of first string", f.get_vector_length(0)) #print("str[0,0:3]", f.get_feature(0,0), f.get_feature(0,1), f.get_feature(0,2)) #print("len(str[0])", f.get_vector_length(0)) #print("str[0]", f.get_feature_vector(0)) #or load features from file (one string per line) fil = CSVFile(fname) f.load(fil) #print(f.get_features()) #or load fasta file #f.load_fasta('fasta.fa') #print(f.get_features()) return f.get_features(), f
def init_sensor(self, kernel, svs): f = StringCharFeatures(svs, DNA) kname = kernel['name'] if kname == 'spectrum': wf = StringWordFeatures(f.get_alphabet()) wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0, False) pre = SortWordString() pre.init(wf) wf.add_preprocessor(pre) wf.apply_preprocessor() f = wf k = CommWordStringKernel(0, False) k.set_use_dict_diagonal_optimization(kernel['order'] < 8) self.preproc = pre elif kname == 'wdshift': k = WeightedDegreePositionStringKernel(0, kernel['order']) k.set_normalizer(IdentityKernelNormalizer()) k.set_shifts(kernel['shift'] * numpy.ones(f.get_max_vector_length(), dtype=numpy.int32)) k.set_position_weights(1.0 / f.get_max_vector_length() * numpy.ones(f.get_max_vector_length(), dtype=numpy.float64)) else: raise "Currently, only wdshift and spectrum kernels supported" self.kernel = k self.train_features = f return (self.kernel, self.train_features)
def create_distance_matrix(full_essays, ids): string_features = StringCharFeatures(full_essays, RAWBYTE) sk = SubsequenceStringKernel(string_features, string_features, 3, 0.5) sk_matrix = sk.get_kernel_matrix() sk_df = pd.DataFrame(sk_matrix) sk_df.columns = ['id_' + str(i) for i in ids] return (sk_df)
def get_predictions_from_seqdict(self, seqdic, site): """ we need to generate a huge test features object containing all locations found in each seqdict-sequence and each location (this is necessary to efficiently (==fast,low memory) compute the splice outputs """ seqlen=self.window_right+self.window_left+2 for s in seqdic: position_list=DynamicIntArray() sequence=s.seq positions=s.preds[site].positions for j in xrange(len(positions)): i=positions[j] - self.offset -self.window_left position_list.append_element(i) t=StringCharFeatures([sequence], DNA) t.obtain_by_position_list(seqlen, position_list) self.wd_kernel.init(self.traindat, t) self.wd_kernel.io.enable_progress() l=self.svm.apply().get_values() self.wd_kernel.cleanup() sys.stdout.write("\n...done...\n") num=len(s.preds[site].positions) scores= num * [0] for j in xrange(num): scores[j]=l[j] s.preds[site].set_scores(scores)
def features_hasheddocdot(strings): from shogun import StringCharFeatures, RAWBYTE from shogun import HashedDocDotFeatures from shogun import NGramTokenizer from numpy import array #create string features f = StringCharFeatures(strings, RAWBYTE) #set the number of bits of the target dimension #means a dim of size 2^5=32 num_bits = 5 #create the ngram tokenizer of size 8 to parse the strings tokenizer = NGramTokenizer(8) #normalize results normalize = True #create HashedDocDot features hddf = HashedDocDotFeatures(num_bits, f, tokenizer, normalize) #should expect 32 #print('Feature space dimensionality is', hddf.get_dim_feature_space()) #print('Self dot product of string 0', hddf.dot(0, hddf, 0)) return hddf
def kernel_distantsegments(fm_train_dna=traindat, fm_test_dna=testdat, delta=5, theta=5): from shogun import StringCharFeatures, DNA from shogun import DistantSegmentsKernel feats_train = StringCharFeatures(fm_train_dna, DNA) feats_test = StringCharFeatures(fm_test_dna, DNA) kernel = DistantSegmentsKernel(feats_train, feats_train, 10, delta, theta) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def score(self, data, label): from shogun import SNP, RAWBYTE from shogun import StringCharFeatures from sklearn.metrics import accuracy_score from sklearn.metrics import f1_score if self.isSNP: feats_test = StringCharFeatures(data, SNP) else: feats_test = StringCharFeatures(data, RAWBYTE) # 将测试string数据转化为中间量 self.kernel.init(self.feats_train, feats_test) feats_test = self.kernel.get_kernel_matrix() retult = self.clf.predict(feats_test.T) acc = accuracy_score(label, retult) f1 = f1_score(label, retult, average='macro') print '正确率是:' + str(acc), 'F1得分是:' + str(f1) return acc, f1
def kernel_poly_match_string(fm_train_dna=traindat, fm_test_dna=testdat, degree=3, inhomogene=False): from shogun import PolyMatchStringKernel from shogun import StringCharFeatures, DNA feats_train = StringCharFeatures(fm_train_dna, DNA) feats_test = StringCharFeatures(fm_train_dna, DNA) kernel = PolyMatchStringKernel(feats_train, feats_train, degree, inhomogene) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel