示例#1
0
def kernel_salzberg_word_string(fm_train_dna=traindat,
                                fm_test_dna=testdat,
                                label_train_dna=label_traindat,
                                order=3,
                                gap=0,
                                reverse=False):
    """Compute Salzberg word-string kernel matrices for DNA strings.

    Both string collections are turned into word features, a
    ``PluginEstimate`` is trained on the training words and labels, and a
    ``SalzbergWordStringKernel`` is evaluated first train-vs-train and then
    train-vs-test.

    Args:
        fm_train_dna: training strings (presumably DNA sequences - confirm
            against the caller).
        fm_test_dna: test strings.
        label_train_dna: labels wrapped in ``BinaryLabels``.
        order, gap, reverse: forwarded to ``obtain_from_char`` as
            ``(order - 1, order, gap, reverse)``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import SalzbergWordStringKernel
    from shogun import PluginEstimate

    def _as_words(strings):
        # Char features -> word features over the same alphabet.
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    feats_train = _as_words(fm_train_dna)
    feats_test = _as_words(fm_test_dna)

    labels = BinaryLabels(label_train_dna)
    pie = PluginEstimate()
    pie.set_labels(labels)
    pie.set_features(feats_train)
    pie.train()

    kernel = SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
    km_train = kernel.get_kernel_matrix()

    # Re-initialise against the test data; the estimator is applied to the
    # test features as well, but its predictions are discarded.
    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.apply().get_labels()
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
示例#2
0
def kernel_match_word_string(fm_train_dna=traindat,
                             fm_test_dna=testdat,
                             degree=3,
                             scale=1.4,
                             size_cache=10,
                             order=3,
                             gap=0,
                             reverse=False):
    """Compute ``MatchWordStringKernel`` matrices for DNA strings.

    The kernel is created with ``(size_cache, degree)``, given an
    ``AvgDiagKernelNormalizer(scale)`` and evaluated train-vs-train and
    train-vs-test on word features derived from the input strings.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        degree: second constructor argument of the kernel.
        scale: argument of the average-diagonal normalizer.
        size_cache: first constructor argument of the kernel.
        order, gap, reverse: forwarded to ``obtain_from_char`` as
            ``(order - 1, order, gap, reverse)``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import MatchWordStringKernel, AvgDiagKernelNormalizer
    from shogun import StringWordFeatures, StringCharFeatures, DNA

    def _as_words(strings):
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(DNA)
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    feats_train = _as_words(fm_train_dna)
    feats_test = _as_words(fm_test_dna)

    kernel = MatchWordStringKernel(size_cache, degree)
    kernel.set_normalizer(AvgDiagKernelNormalizer(scale))

    matrices = []
    for rhs in (feats_train, feats_test):
        kernel.init(feats_train, rhs)
        matrices.append(kernel.get_kernel_matrix())

    km_train, km_test = matrices
    return km_train, km_test, kernel
示例#3
0
def classifier_ssk(fm_train_dna=traindat,
                   fm_test_dna=testdat,
                   label_train_dna=label_traindat,
                   C=1,
                   maxlen=1,
                   decay=1):
    """Train a LibSVM with a subsequence string kernel and label test data.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        label_train_dna: labels wrapped in ``BinaryLabels``.
        C: first constructor argument of ``LibSVM``.
        maxlen, decay: forwarded to ``SubsequenceStringKernel``.

    Returns:
        The result of ``get_labels()`` on the SVM output for the test data.
    """
    from shogun import StringCharFeatures, BinaryLabels
    from shogun import LibSVM, SubsequenceStringKernel, DNA
    from shogun import ErrorRateMeasure

    train_strings = StringCharFeatures(fm_train_dna, DNA)
    test_strings = StringCharFeatures(fm_test_dna, DNA)
    train_labels = BinaryLabels(label_train_dna)
    ssk = SubsequenceStringKernel(train_strings, train_strings, maxlen, decay)

    machine = LibSVM(C, ssk, train_labels)
    machine.train()

    # The training error is evaluated but intentionally not returned.
    train_output = machine.apply(train_strings)
    ErrorRateMeasure().evaluate(train_output, train_labels)

    ssk.init(train_strings, test_strings)
    return machine.apply(test_strings).get_labels()
示例#4
0
def distance_hammingword(fm_train_dna=traindna, fm_test_dna=testdna,
                         fm_test_real=testdat, order=3, gap=0, reverse=False,
                         use_sign=False):
    """Compute Hamming word distance matrices for DNA strings.

    Training and test strings are converted to word features and run through
    a shared ``SortWordString`` preprocessor (initialised on the training
    features) before the distance is evaluated.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        fm_test_real: not used by this function.
        order, gap, reverse: forwarded to ``obtain_from_char`` as
            ``(order - 1, order, gap, reverse)``.
        use_sign: third constructor argument of ``HammingWordDistance``.

    Returns:
        Tuple ``(distance, dm_train, dm_test)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString
    from shogun import HammingWordDistance

    def _as_words(strings):
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    feats_train = _as_words(fm_train_dna)
    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preprocessor(sorter)
    feats_train.apply_preprocessor()

    # The test features reuse the preprocessor initialised on the train data.
    feats_test = _as_words(fm_test_dna)
    feats_test.add_preprocessor(sorter)
    feats_test.apply_preprocessor()

    distance = HammingWordDistance(feats_train, feats_train, use_sign)
    dm_train = distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test
def classifier_svmlight_linear_term(fm_train_dna=traindna, fm_test_dna=testdna,
                                    label_train_dna=label_traindna, degree=3,
                                    C=10, epsilon=1e-5, num_threads=1):
    """Train SVMLight with a custom linear term and label the test strings.

    A weighted-degree string kernel is built on the training strings, the
    SVM is given a fixed ten-element linear term, trained, and then applied
    after the kernel has been re-initialised against the test strings.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        label_train_dna: labels wrapped in ``BinaryLabels``.
        degree: degree of ``WeightedDegreeStringKernel``.
        C: first constructor argument of ``SVMLight``.
        epsilon: value passed to ``set_epsilon``.
        num_threads: value passed to ``parallel.set_num_threads``.

    Returns:
        Tuple ``(out, kernel)`` where ``out`` is ``get_labels()`` of the SVM
        output. If ``SVMLight`` cannot be imported, a message is printed and
        the interpreter exits via ``exit(0)``.
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        print("SVMLight is not available")
        exit(0)

    train_strings = StringCharFeatures(DNA)
    train_strings.set_features(fm_train_dna)
    test_strings = StringCharFeatures(DNA)
    test_strings.set_features(fm_test_dna)

    wd_kernel = WeightedDegreeStringKernel(train_strings, train_strings, degree)
    train_labels = BinaryLabels(label_train_dna)

    machine = SVMLight(C, wd_kernel, train_labels)
    machine.set_qpsize(3)
    # NOTE(review): the linear term has exactly ten entries, so this
    # presumably expects ten training examples - confirm against the data.
    linear_term = -numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 7, 6],
                               dtype=numpy.double)
    machine.set_linear_term(linear_term)
    machine.set_epsilon(epsilon)
    machine.parallel.set_num_threads(num_threads)
    machine.train()

    wd_kernel.init(train_strings, test_strings)
    predictions = machine.apply().get_labels()
    return predictions, wd_kernel
示例#6
0
def kernel_weighted_degree_string(fm_train_dna=traindat, fm_test_dna=testdat, degree=20):
    """Compute weighted-degree string kernel matrices with custom weights.

    The per-degree weights are the descending ramp ``degree, ..., 1`` divided
    by the sum ``1 + ... + degree`` and are installed with
    ``set_wd_weights`` before the matrices are computed.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        degree: degree of ``WeightedDegreeStringKernel`` and length of the
            weight vector.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, DNA
    from shogun import WeightedDegreeStringKernel, MSG_DEBUG

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    from numpy import arange, double
    ramp = arange(1, degree + 1, dtype=double)
    # Largest weight first, normalised so that the weights sum to one.
    weights = ramp[::-1] / sum(ramp)
    kernel.set_wd_weights(weights)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def kernel_histogram_word_string(fm_train_dna=traindat,
                                 fm_test_dna=testdat,
                                 label_train_dna=label_traindat,
                                 order=3,
                                 ppseudo_count=1,
                                 npseudo_count=1):
    """Compute histogram word-string kernel matrices for DNA strings.

    A ``PluginEstimate`` built with the two pseudo counts is trained on the
    training word features and handed to ``HistogramWordStringKernel``.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        label_train_dna: labels wrapped in ``BinaryLabels``.
        order: forwarded to ``obtain_from_char`` as ``(order - 1, order)``;
            the remaining two arguments are fixed to ``0`` and ``False``.
        ppseudo_count, npseudo_count: constructor arguments of
            ``PluginEstimate``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
    from shogun import PluginEstimate  #, MSG_DEBUG

    def _as_words(strings):
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, 0, False)
        return words

    feats_train = _as_words(fm_train_dna)
    feats_test = _as_words(fm_test_dna)

    estimator = PluginEstimate(ppseudo_count, npseudo_count)
    train_labels = BinaryLabels(label_train_dna)
    estimator.set_labels(train_labels)
    estimator.set_features(feats_train)
    estimator.train()

    kernel = HistogramWordStringKernel(feats_train, feats_train, estimator)
    km_train = kernel.get_kernel_matrix()

    # The estimator is applied to the test features too; its predictions
    # are discarded.
    kernel.init(feats_train, feats_test)
    estimator.set_features(feats_test)
    estimator.apply().get_labels()
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
示例#8
0
def kernel_poly_match_word_string(fm_train_dna=traindat,
                                  fm_test_dna=testdat,
                                  degree=2,
                                  inhomogene=True,
                                  order=3,
                                  gap=0,
                                  reverse=False):
    """Compute ``PolyMatchWordStringKernel`` matrices for DNA strings.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        degree, inhomogene: forwarded to the kernel constructor.
        order, gap, reverse: forwarded to ``obtain_from_char`` as
            ``(order - 1, order, gap, reverse)``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import PolyMatchWordStringKernel
    from shogun import StringWordFeatures, StringCharFeatures, DNA

    def _as_words(strings):
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(DNA)
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    feats_train = _as_words(fm_train_dna)
    feats_test = _as_words(fm_test_dna)

    kernel = PolyMatchWordStringKernel(
        feats_train, feats_train, degree, inhomogene)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
示例#9
0
def distance_manhattenword(train_fname=traindna,
                           test_fname=testdna,
                           order=3,
                           gap=0,
                           reverse=False):
    """Compute Manhattan word distance matrices from two CSV files.

    Each file is loaded through ``CSVFile`` into DNA char features,
    converted to word features and passed through a shared
    ``SortWordString`` preprocessor initialised on the training features.

    Args:
        train_fname: argument of ``CSVFile`` for the training data.
        test_fname: argument of ``CSVFile`` for the test data.
        order, gap, reverse: forwarded to ``obtain_from_char`` as
            ``(order - 1, order, gap, reverse)``.

    Returns:
        Tuple ``(dm_train, dm_test)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString, ManhattanWordDistance, CSVFile

    def _load_words(fname):
        chars = StringCharFeatures(CSVFile(fname), DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    feats_train = _load_words(train_fname)
    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preprocessor(sorter)
    feats_train.apply_preprocessor()

    feats_test = _load_words(test_fname)
    feats_test.add_preprocessor(sorter)
    feats_test.apply_preprocessor()

    distance = ManhattanWordDistance(feats_train, feats_train)
    dm_train = distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()

    return dm_train, dm_test
def preprocessor_sortwordstring(fm_train_dna=traindna,
                                fm_test_dna=testdna,
                                order=3,
                                gap=0,
                                reverse=False,
                                use_sign=False):
    """Apply ``SortWordString`` to word features and compute a kernel.

    The preprocessor is initialised on the training word features, applied
    to both training and test features, and a ``CommWordStringKernel`` is
    then evaluated train-vs-train and train-vs-test.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        order, gap, reverse: forwarded to ``obtain_from_char`` as
            ``(order - 1, order, gap, reverse)``.
        use_sign: third constructor argument of ``CommWordStringKernel``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import CommWordStringKernel
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString

    def _as_words(strings):
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    feats_train = _as_words(fm_train_dna)
    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preprocessor(sorter)
    feats_train.apply_preprocessor()

    feats_test = _as_words(fm_test_dna)
    feats_test.add_preprocessor(sorter)
    feats_test.apply_preprocessor()

    kernel = CommWordStringKernel(feats_train, feats_train, use_sign)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
示例#11
0
def classifier_domainadaptationsvm (fm_train_dna=traindna,fm_test_dna=testdna, \
                                                label_train_dna=label_traindna, \
                                               label_test_dna=label_testdna,fm_train_dna2=traindna2,fm_test_dna2=testdna2, \
                                               label_train_dna2=label_traindna2,label_test_dna2=label_testdna2,C=1,degree=3):
    """Train a source-domain SVM, then a DomainAdaptationSVM on a second domain.

    An ``SVMLight`` is first trained on the first domain. A
    ``DomainAdaptationSVM`` is then trained on the second domain's strings
    and labels, regularised against the first SVM, and applied to the second
    domain's test strings.

    Previously the second stage was built from the first domain's data, so
    ``fm_train_dna2``, ``fm_test_dna2`` and ``label_train_dna2`` were ignored.

    The Shogun names used here (``StringCharFeatures``, ``DNA``,
    ``WeightedDegreeStringKernel``, ``BinaryLabels``, ``SVMLight``,
    ``DomainAdaptationSVM``) are expected to be in scope at module level.

    Args:
        fm_train_dna: training strings of the first (source) domain.
        fm_test_dna: accepted for interface compatibility; unused.
        label_train_dna: training labels of the first domain.
        label_test_dna: accepted for interface compatibility; unused.
        fm_train_dna2: training strings of the second (target) domain.
        fm_test_dna2: test strings of the second domain.
        label_train_dna2: training labels of the second domain.
        label_test_dna2: accepted for interface compatibility; unused.
        C: first constructor argument of both SVMs.
        degree: degree of both weighted-degree string kernels.

    Returns:
        The result of ``dasvm.apply_binary`` on the second domain's test
        features.
    """
    # Stage 1: plain SVM on the source domain.
    feats_train = StringCharFeatures(fm_train_dna, DNA)
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    labels = BinaryLabels(label_train_dna)
    svm = SVMLight(C, kernel, labels)
    svm.train()

    # Stage 2: DA-SVM on the target domain, using that domain's own data.
    feats_train2 = StringCharFeatures(fm_train_dna2, DNA)
    feats_test2 = StringCharFeatures(fm_test_dna2, DNA)
    kernel2 = WeightedDegreeStringKernel(feats_train2, feats_train2, degree)
    labels2 = BinaryLabels(label_train_dna2)

    # Regularise against the previously obtained solution (weight 1.0).
    dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
    dasvm.train()

    out = dasvm.apply_binary(feats_test2)

    return out  #,dasvm TODO
示例#12
0
def classifier_svmlight(fm_train_dna=traindat,
                        fm_test_dna=testdat,
                        label_train_dna=label_traindat,
                        C=1.2,
                        epsilon=1e-5,
                        num_threads=1):
    """Train SVMLight with a degree-20 weighted-degree string kernel.

    After training, the kernel is re-initialised against the test strings
    and the SVM is applied; the predictions themselves are discarded.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        label_train_dna: labels wrapped in ``BinaryLabels``.
        C: first constructor argument of ``SVMLight``.
        epsilon: value passed to ``set_epsilon``.
        num_threads: value passed to ``parallel.set_num_threads``.

    Returns:
        The kernel, or ``None`` (after printing a message) when ``SVMLight``
        cannot be imported.
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    train_strings = StringCharFeatures(DNA)
    train_strings.set_features(fm_train_dna)
    test_strings = StringCharFeatures(DNA)
    test_strings.set_features(fm_test_dna)

    degree = 20
    wd_kernel = WeightedDegreeStringKernel(train_strings, train_strings, degree)
    train_labels = BinaryLabels(label_train_dna)

    machine = SVMLight(C, wd_kernel, train_labels)
    machine.set_epsilon(epsilon)
    machine.parallel.set_num_threads(num_threads)
    machine.train()

    wd_kernel.init(train_strings, test_strings)
    machine.apply().get_labels()
    return wd_kernel
示例#13
0
def kernel_combined(fm_train_real=traindat, fm_test_real=testdat, fm_train_dna=traindna, fm_test_dna=testdna):
    """Compute a combined kernel over one real-valued and two string views.

    Three sub-kernels are appended in this order, each paired with its own
    train/test feature objects: a ``GaussianKernel(10, 1.1)`` on the real
    features, a ``FixedDegreeStringKernel(10, 3)`` on DNA strings and a
    ``LocalAlignmentStringKernel(10)`` on DNA strings.

    Args:
        fm_train_real: training data for ``RealFeatures``.
        fm_test_real: test data for ``RealFeatures``.
        fm_train_dna: training strings.
        fm_test_dna: test strings.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from shogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    degree = 3
    # (train features, test features, sub-kernel), in append order.
    components = (
        (RealFeatures(fm_train_real),
         RealFeatures(fm_test_real),
         GaussianKernel(10, 1.1)),
        (StringCharFeatures(fm_train_dna, DNA),
         StringCharFeatures(fm_test_dna, DNA),
         FixedDegreeStringKernel(10, degree)),
        (StringCharFeatures(fm_train_dna, DNA),
         StringCharFeatures(fm_test_dna, DNA),
         LocalAlignmentStringKernel(10)),
    )
    for sub_train, sub_test, sub_kernel in components:
        feats_train.append_feature_obj(sub_train)
        feats_test.append_feature_obj(sub_test)
        kernel.append_kernel(sub_kernel)

    kernel.init(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
示例#14
0
def kernel_comm_ulong_string(fm_train_dna=traindat,
                             fm_test_dna=testdat,
                             order=3,
                             gap=0,
                             reverse=False):
    """Compute ``CommUlongStringKernel`` matrices for DNA strings.

    Strings are converted to ``StringUlongFeatures`` and sorted with a shared
    ``SortUlongString`` preprocessor that is initialised on the training
    features. The kernel is built with ``use_sign`` fixed to ``False``.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        order, gap, reverse: forwarded to ``obtain_from_char`` as
            ``(order - 1, order, gap, reverse)``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import CommUlongStringKernel
    from shogun import StringUlongFeatures, StringCharFeatures, DNA
    from shogun import SortUlongString

    def _as_ulongs(strings):
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        ulongs = StringUlongFeatures(chars.get_alphabet())
        ulongs.obtain_from_char(chars, order - 1, order, gap, reverse)
        return ulongs

    feats_train = _as_ulongs(fm_train_dna)
    sorter = SortUlongString()
    sorter.init(feats_train)
    feats_train.add_preprocessor(sorter)
    feats_train.apply_preprocessor()

    feats_test = _as_ulongs(fm_test_dna)
    feats_test.add_preprocessor(sorter)
    feats_test.apply_preprocessor()

    use_sign = False
    kernel = CommUlongStringKernel(feats_train, feats_train, use_sign)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
示例#15
0
def kernel_top(fm_train_dna=traindat,
               fm_test_dna=testdat,
               label_train_dna=label_traindat,
               pseudo=1e-1,
               order=1,
               gap=0,
               reverse=False,
               kargs=[1, False, True]):
    """Compute a polynomial kernel on TOP features derived from two HMMs.

    One HMM is trained on ``fm_hmm_pos`` and one on ``fm_hmm_neg``; both
    names are read from the enclosing scope rather than passed in. The HMMs
    (and, for the test side, copies of them) are given the train/test word
    features as observations and wrapped in ``TOPFeatures``, on which a
    ``PolyKernel`` is evaluated.

    Args:
        fm_train_dna: training strings for the kernel.
        fm_test_dna: test strings for the kernel.
        label_train_dna: not used by this function.
        pseudo: fourth constructor argument of ``HMM``.
        order, gap, reverse: forwarded to ``obtain_from_char`` as
            ``(order - 1, order, gap, reverse)``.
        kargs: extra positional arguments unpacked into ``PolyKernel``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
    from shogun import PolyKernel
    from shogun import HMM, BW_NORMAL

    N = 1  # toy HMM with 1 state
    M = 4  # 4 observations -> DNA

    def _as_words(strings):
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    def _train_hmm(words):
        model = HMM(words, N, M, pseudo)
        model.baum_welch_viterbi_train(BW_NORMAL)
        return model

    # One HMM per class; the word features stay referenced while in use.
    hmm_pos_train = _as_words(fm_hmm_pos)
    pos = _train_hmm(hmm_pos_train)
    hmm_neg_train = _as_words(fm_hmm_neg)
    neg = _train_hmm(hmm_neg_train)

    wordfeats_train = _as_words(fm_train_dna)
    wordfeats_test = _as_words(fm_test_dna)

    # Kernel on the training data.
    for model in (pos, neg):
        model.set_observations(wordfeats_train)
    feats_train = TOPFeatures(10, pos, neg, False, False)
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # Kernel on the test data, using copies of the trained HMMs.
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    for model in (pos_clone, neg_clone):
        model.set_observations(wordfeats_test)
    feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False)
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
示例#16
0
def kernel_fisher(fm_train_dna=traindat, fm_test_dna=testdat,
                  label_train_dna=label_traindat,
                  N=1, M=4, pseudo=1e-1, order=1, gap=0, reverse=False,
                  kargs=[1, False, True]):
    """Compute a polynomial kernel on Fisher-kernel features from two HMMs.

    One HMM is trained on ``fm_hmm_pos`` and one on ``fm_hmm_neg``; both
    names are read from the enclosing scope rather than passed in. The
    training ``FKFeatures`` get ``set_opt_a(-1)``; the test ``FKFeatures``
    reuse the resulting ``a`` value from the training features.

    Args:
        fm_train_dna: training strings for the kernel.
        fm_test_dna: test strings for the kernel.
        label_train_dna: not used by this function.
        N, M, pseudo: constructor arguments of ``HMM``.
        order, gap, reverse: forwarded to ``obtain_from_char`` as
            ``(order - 1, order, gap, reverse)``.
        kargs: extra positional arguments unpacked into ``PolyKernel``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from shogun import PolyKernel
    from shogun import HMM, BW_NORMAL#, MSG_DEBUG

    def _as_words(strings):
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    def _train_hmm(words):
        model = HMM(words, N, M, pseudo)
        model.baum_welch_viterbi_train(BW_NORMAL)
        return model

    # One HMM per class; the word features stay referenced while in use.
    hmm_pos_train = _as_words(fm_hmm_pos)
    pos = _train_hmm(hmm_pos_train)
    hmm_neg_train = _as_words(fm_hmm_neg)
    neg = _train_hmm(hmm_neg_train)

    wordfeats_train = _as_words(fm_train_dna)
    wordfeats_test = _as_words(fm_test_dna)

    # Kernel on the training data.
    for model in (pos, neg):
        model.set_observations(wordfeats_train)
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1)  # estimate prior
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # Kernel on the test data, using copies of the trained HMMs.
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    for model in (pos_clone, neg_clone):
        model.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a())  # use prior from training data
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def get_wd_features(data, feat_type="dna"):
    """Build a ``StringCharFeatures`` object holding ``data``.

    Args:
        data: value handed to ``set_features``.
        feat_type: ``"dna"`` selects the ``DNA`` alphabet, ``"protein"`` the
            ``PROTEIN`` alphabet.

    Returns:
        The populated feature object.

    Raises:
        Exception: if ``feat_type`` is neither ``"dna"`` nor ``"protein"``.
    """
    if feat_type not in ("dna", "protein"):
        raise Exception("unknown feature type")

    # Look up only the alphabet that is actually needed.
    alphabet = DNA if feat_type == "dna" else PROTEIN
    feat = StringCharFeatures(alphabet)
    feat.set_features(data)

    return feat
示例#18
0
def kernel_linear_string(fm_train_dna=traindat, fm_test_dna=testdat):
    """Compute ``LinearStringKernel`` matrices for DNA strings.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, DNA
    from shogun import LinearStringKernel

    train_strings = StringCharFeatures(fm_train_dna, DNA)
    test_strings = StringCharFeatures(fm_test_dna, DNA)

    linear_kernel = LinearStringKernel(train_strings, train_strings)
    train_matrix = linear_kernel.get_kernel_matrix()

    linear_kernel.init(train_strings, test_strings)
    test_matrix = linear_kernel.get_kernel_matrix()

    return train_matrix, test_matrix, linear_kernel
示例#19
0
 def predict(self, data):
     """Predict labels for ``data`` and print them space-separated.

     The strings are wrapped in ``StringCharFeatures`` (``SNP`` alphabet when
     ``self.isSNP`` is truthy, ``RAWBYTE`` otherwise). ``self.kernel`` is
     initialised against ``self.feats_train`` and the new features, and the
     transposed kernel matrix is passed to ``self.clf.predict``.

     Args:
         data: test strings accepted by ``StringCharFeatures``.

     Returns:
         Whatever ``self.clf.predict`` returns.
     """
     from shogun import SNP, RAWBYTE
     from shogun import StringCharFeatures
     alphabet = SNP if self.isSNP else RAWBYTE
     feats_test = StringCharFeatures(data, alphabet)
     # Convert the test strings into an intermediate representation: the
     # kernel matrix between the training and the test features.
     self.kernel.init(self.feats_train, feats_test)
     kernel_matrix = self.kernel.get_kernel_matrix()
     result = self.clf.predict(kernel_matrix.T)
     # Single-argument print() emits the same text on Python 2 and 3.
     print(' '.join(map(str, result)))
     return result
def kernel_fixed_degree_string(fm_train_dna=traindat, fm_test_dna=testdat, degree=3):
    """Compute ``FixedDegreeStringKernel`` matrices for DNA strings.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        degree: third constructor argument of the kernel.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, DNA
    from shogun import FixedDegreeStringKernel

    train_strings = StringCharFeatures(fm_train_dna, DNA)
    test_strings = StringCharFeatures(fm_test_dna, DNA)

    fd_kernel = FixedDegreeStringKernel(train_strings, train_strings, degree)
    train_matrix = fd_kernel.get_kernel_matrix()

    fd_kernel.init(train_strings, test_strings)
    test_matrix = fd_kernel.get_kernel_matrix()

    return train_matrix, test_matrix, fd_kernel
示例#21
0
def distribution_hmm(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
    """Train an HMM on CUBE-alphabet strings and query its likelihoods.

    After training, the log derivative is requested for every
    (parameter, example) pair and the best path and best-path states for
    every example; those values are accumulated locally but not returned.

    Args:
        fm_cube: strings handed to ``StringCharFeatures(CUBE)``.
        N, M, pseudo: constructor arguments of ``HMM``.
        order, gap, reverse: forwarded to ``obtain_from_char`` as
            ``(order - 1, order, gap, reverse)``.
        num_examples: ignored; it is overwritten with the number of
            vectors in the word features.

    Returns:
        Tuple ``(lik_example, lik_sample, hmm)``.
    """
    from shogun import StringWordFeatures, StringCharFeatures, CUBE
    from shogun import HMM, BW_NORMAL

    chars = StringCharFeatures(CUBE)
    chars.set_features(fm_cube)
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, order - 1, order, gap, reverse)

    hmm = HMM(words, N, M, pseudo)
    hmm.train()
    hmm.baum_welch_viterbi_train(BW_NORMAL)

    num_examples = words.get_num_vectors()
    num_param = hmm.get_num_model_parameters()
    for example in range(num_examples):
        for param in range(num_param):
            hmm.get_log_derivative(param, example)

    # best_path(example) is called before the per-state queries for that
    # example; keep that order.
    path_total = 0
    state_total = 0
    for example in range(num_examples):
        path_total += hmm.best_path(example)
        for state in range(N):
            state_total += hmm.get_best_path_state(example, state)

    lik_example = hmm.get_log_likelihood()
    lik_sample = hmm.get_log_likelihood_sample()

    return lik_example, lik_sample, hmm
示例#22
0
def distribution_linearhmm(fm_dna=traindna, order=3, gap=0, reverse=False):
    """Train a ``LinearHMM`` on DNA word features and query its likelihoods.

    The transition probabilities and the log derivative of every
    (parameter, example) pair are requested but not returned.

    Args:
        fm_dna: strings handed to ``StringCharFeatures(DNA)``.
        order, gap, reverse: forwarded to ``obtain_from_char`` as
            ``(order - 1, order, gap, reverse)``.

    Returns:
        Tuple ``(hmm, out_likelihood, out_sample)``.
    """
    from shogun import StringWordFeatures, StringCharFeatures, DNA
    from shogun import LinearHMM

    chars = StringCharFeatures(DNA)
    chars.set_features(fm_dna)
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, order - 1, order, gap, reverse)

    hmm = LinearHMM(words)
    hmm.train()

    hmm.get_transition_probs()

    example_count = words.get_num_vectors()
    param_count = hmm.get_num_model_parameters()
    for example in range(example_count):
        for param in range(param_count):
            hmm.get_log_derivative(param, example)

    out_likelihood = hmm.get_log_likelihood()
    out_sample = hmm.get_log_likelihood_sample()

    return hmm, out_likelihood, out_sample
示例#23
0
def features_string_file(directory, fname):
    """Load RAWBYTE string features from a directory, then from a file.

    The same feature object is first filled via ``load_from_directory`` and
    afterwards via ``load`` from a ``CSVFile`` wrapping ``fname``.

    Args:
        directory: argument of ``load_from_directory``.
        fname: argument of ``CSVFile`` (the original notes describe it as
            one string per line).

    Returns:
        Tuple ``(features.get_features(), features)``.
    """
    from shogun import StringCharFeatures, RAWBYTE
    from shogun import CSVFile

    string_feats = StringCharFeatures(RAWBYTE)

    # First source: every file in the directory.
    string_feats.load_from_directory(directory)

    # Second source: a single file.
    string_feats.load(CSVFile(fname))

    return string_feats.get_features(), string_feats
示例#24
0
    def init_sensor(self, kernel, svs):
        """Build the kernel and training features described by ``kernel``.

        Args:
            kernel: mapping with key ``'name'`` (``'spectrum'`` or
                ``'wdshift'``) and ``'order'``; ``'wdshift'`` additionally
                reads ``'shift'``.
            svs: strings wrapped in ``StringCharFeatures(svs, DNA)``
                (presumably the support vectors - confirm with the caller).

        Returns:
            Tuple ``(self.kernel, self.train_features)``. For ``'spectrum'``
            the sorting preprocessor is also stored in ``self.preproc``.

        Raises:
            ValueError: if ``kernel['name']`` is not a supported kernel. The
                original ``raise "<string>"`` is not a valid exception and
                itself failed with a ``TypeError``.
        """
        kname = kernel['name']
        # Reject unsupported kernels before any feature object is built.
        if kname not in ('spectrum', 'wdshift'):
            raise ValueError(
                "Currently, only wdshift and spectrum kernels supported")

        f = StringCharFeatures(svs, DNA)

        if kname == 'spectrum':
            wf = StringWordFeatures(f.get_alphabet())
            wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0, False)

            pre = SortWordString()
            pre.init(wf)
            wf.add_preprocessor(pre)
            wf.apply_preprocessor()
            f = wf

            k = CommWordStringKernel(0, False)
            k.set_use_dict_diagonal_optimization(kernel['order'] < 8)
            self.preproc = pre
        else:  # kname == 'wdshift'
            max_len = f.get_max_vector_length()
            k = WeightedDegreePositionStringKernel(0, kernel['order'])
            k.set_normalizer(IdentityKernelNormalizer())
            # Same shift at every position, uniform position weights.
            k.set_shifts(kernel['shift'] *
                         numpy.ones(max_len, dtype=numpy.int32))
            k.set_position_weights(1.0 / max_len *
                                   numpy.ones(max_len, dtype=numpy.float64))

        self.kernel = k
        self.train_features = f

        return (self.kernel, self.train_features)
def create_distance_matrix(full_essays, ids):
    """Return the subsequence-string-kernel matrix of the essays as a frame.

    Despite the function name, the values come from
    ``SubsequenceStringKernel.get_kernel_matrix`` (built with arguments
    ``3`` and ``0.5``), i.e. they are kernel values, not distances.

    Args:
        full_essays: strings wrapped in ``StringCharFeatures(..., RAWBYTE)``.
        ids: identifiers; column ``k`` is named ``'id_' + str(ids[k])``.

    Returns:
        A ``pandas.DataFrame`` holding the kernel matrix.
    """
    essay_features = StringCharFeatures(full_essays, RAWBYTE)
    subseq_kernel = SubsequenceStringKernel(
        essay_features, essay_features, 3, 0.5)
    frame = pd.DataFrame(subseq_kernel.get_kernel_matrix())
    frame.columns = ['id_' + str(essay_id) for essay_id in ids]
    return frame
示例#26
0
	def get_predictions_from_seqdict(self, seqdic, site):
		"""Score every candidate position of every sequence in ``seqdic``.

		For each sequence one test feature object is built that contains a
		window for every candidate position, so that all SVM outputs for
		that sequence are computed in a single pass. The original notes give
		speed and low memory use as the reason.

		Args:
			seqdic: iterable of objects exposing ``seq`` and ``preds``.
			site: key into ``preds``; ``preds[site].positions`` is read and
				``preds[site].set_scores`` is called with the results.

		Returns:
			None. Scores are stored on each sequence object.
		"""

		seqlen = self.window_right + self.window_left + 2

		for s in seqdic:
			position_list = DynamicIntArray()

			sequence = s.seq
			positions = s.preds[site].positions
			# Shift each position to the start of its window. Direct
			# iteration replaces the Python-2-only xrange index loop.
			for pos in positions:
				position_list.append_element(pos - self.offset - self.window_left)

			t = StringCharFeatures([sequence], DNA)
			t.obtain_by_position_list(seqlen, position_list)
			self.wd_kernel.init(self.traindat, t)

			self.wd_kernel.io.enable_progress()
			l = self.svm.apply().get_values()
			self.wd_kernel.cleanup()
			sys.stdout.write("\n...done...\n")

			# Copy one output per candidate position into a plain list.
			num = len(s.preds[site].positions)
			scores = [l[j] for j in range(num)]
			s.preds[site].set_scores(scores)
示例#27
0
def features_hasheddocdot(strings):
    """Wrap ``strings`` in ``HashedDocDotFeatures``.

    The features are built with 5 hash bits (described in the original
    notes as a target dimension of 2^5 = 32), an ``NGramTokenizer(8)`` and
    normalisation switched on.

    Args:
        strings: documents handed to ``StringCharFeatures(..., RAWBYTE)``.

    Returns:
        The ``HashedDocDotFeatures`` object.
    """
    from shogun import StringCharFeatures, RAWBYTE
    from shogun import HashedDocDotFeatures
    from shogun import NGramTokenizer
    from numpy import array

    num_bits = 5      # number of bits of the target dimension
    normalize = True  # normalize results

    documents = StringCharFeatures(strings, RAWBYTE)
    ngram_tokenizer = NGramTokenizer(8)

    return HashedDocDotFeatures(num_bits, documents, ngram_tokenizer, normalize)
示例#28
0
def kernel_distantsegments(fm_train_dna=traindat,
                           fm_test_dna=testdat,
                           delta=5,
                           theta=5):
    """Compute ``DistantSegmentsKernel`` matrices for DNA strings.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        delta, theta: fourth and fifth constructor arguments of the kernel
            (the third is fixed to ``10``).

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import StringCharFeatures, DNA
    from shogun import DistantSegmentsKernel

    train_strings = StringCharFeatures(fm_train_dna, DNA)
    test_strings = StringCharFeatures(fm_test_dna, DNA)

    ds_kernel = DistantSegmentsKernel(
        train_strings, train_strings, 10, delta, theta)
    train_matrix = ds_kernel.get_kernel_matrix()

    ds_kernel.init(train_strings, test_strings)
    test_matrix = ds_kernel.get_kernel_matrix()

    return train_matrix, test_matrix, ds_kernel
示例#29
0
 def score(self, data, label):
     """Predict ``data`` and report accuracy and macro-averaged F1.

     The strings are wrapped in ``StringCharFeatures`` (``SNP`` alphabet when
     ``self.isSNP`` is truthy, ``RAWBYTE`` otherwise). ``self.kernel`` is
     initialised against ``self.feats_train`` and the new features, and the
     transposed kernel matrix is passed to ``self.clf.predict``. Both
     metrics are printed on one line.

     Args:
         data: test strings accepted by ``StringCharFeatures``.
         label: ground-truth labels compared against the predictions.

     Returns:
         Tuple ``(acc, f1)``.
     """
     from shogun import SNP, RAWBYTE
     from shogun import StringCharFeatures
     from sklearn.metrics import accuracy_score
     from sklearn.metrics import f1_score
     alphabet = SNP if self.isSNP else RAWBYTE
     feats_test = StringCharFeatures(data, alphabet)
     # Convert the test strings into an intermediate representation: the
     # kernel matrix between the training and the test features.
     self.kernel.init(self.feats_train, feats_test)
     kernel_matrix = self.kernel.get_kernel_matrix()
     result = self.clf.predict(kernel_matrix.T)
     acc = accuracy_score(label, result)
     f1 = f1_score(label, result, average='macro')
     # One pre-joined string keeps the output identical on Python 2 and 3
     # (the Python 2 statement separated the two parts with a space).
     print('正确率是:' + str(acc) + ' ' + 'F1得分是:' + str(f1))
     return acc, f1
示例#30
0
def kernel_poly_match_string(fm_train_dna=traindat,
                             fm_test_dna=testdat,
                             degree=3,
                             inhomogene=False):
    """Compute ``PolyMatchStringKernel`` matrices for DNA strings.

    The test features are built from ``fm_test_dna``. Previously they were
    built from ``fm_train_dna``, so the test argument was ignored and
    ``km_test`` merely repeated the training matrix.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        degree, inhomogene: forwarded to the kernel constructor.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from shogun import PolyMatchStringKernel
    from shogun import StringCharFeatures, DNA

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)

    kernel = PolyMatchStringKernel(feats_train, feats_train, degree,
                                   inhomogene)

    km_train = kernel.get_kernel_matrix()
    # Right-hand side switched to the test features.
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel