def distance_hammingword_modular(fm_train_dna=traindna, fm_test_dna=testdna,
                                 fm_test_real=testdat, order=3, gap=0,
                                 reverse=False, use_sign=False):
    """Compute Hamming word distance matrices on sorted DNA word features.

    Args:
        fm_train_dna: Training strings, loaded with the DNA alphabet.
        fm_test_dna: Test strings, loaded with the DNA alphabet.
        fm_test_real: Accepted but never read by this function.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.
        use_sign: Third argument of the ``HammingWordDistance`` constructor.

    Returns:
        Tuple ``(distance, dm_train, dm_test)``: the distance object (left
        initialised on train vs. test), the train/train matrix and the
        train/test matrix.
    """
    from modshogun import HammingWordDistance
    from modshogun import SortWordString
    from modshogun import StringCharFeatures, StringWordFeatures, DNA

    def _as_words(sequences):
        # Raw strings -> DNA char features -> word features.
        chars = StringCharFeatures(DNA)
        chars.set_features(sequences)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _as_words(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)  # initialised on the training features only
    train_words.add_preprocessor(sorter)
    train_words.apply_preprocessor()

    # The test features reuse the very same preprocessor instance.
    test_words = _as_words(fm_test_dna)
    test_words.add_preprocessor(sorter)
    test_words.apply_preprocessor()

    metric = HammingWordDistance(train_words, train_words, use_sign)
    train_matrix = metric.get_distance_matrix()

    metric.init(train_words, test_words)
    test_matrix = metric.get_distance_matrix()
    return metric, train_matrix, test_matrix
# Example #2
0
def preprocessor_sortwordstring_modular(fm_train_dna=traindna,
                                        fm_test_dna=testdna,
                                        order=3,
                                        gap=0,
                                        reverse=False,
                                        use_sign=False):
    """Sort DNA word features with SortWordString and evaluate a CommWordStringKernel.

    Args:
        fm_train_dna: Training strings for ``StringCharFeatures(..., DNA)``.
        fm_test_dna: Test strings for ``StringCharFeatures(..., DNA)``.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.
        use_sign: Third argument of the ``CommWordStringKernel`` constructor.

    Returns:
        Tuple ``(km_train, km_test, kernel)``; the kernel object is left
        initialised on train vs. test.
    """
    from modshogun import DNA, StringCharFeatures, StringWordFeatures
    from modshogun import SortWordString
    from modshogun import CommWordStringKernel

    def _word_features(strings):
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    def _attach_and_apply(words, sorter):
        words.add_preprocessor(sorter)
        words.apply_preprocessor()

    train_words = _word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)  # initialised on the training features only
    _attach_and_apply(train_words, sorter)

    test_words = _word_features(fm_test_dna)
    _attach_and_apply(test_words, sorter)

    comm_kernel = CommWordStringKernel(train_words, train_words, use_sign)
    train_gram = comm_kernel.get_kernel_matrix()

    comm_kernel.init(train_words, test_words)
    test_gram = comm_kernel.get_kernel_matrix()

    return train_gram, test_gram, comm_kernel
def distance_manhattenword_modular(train_fname=traindna,
                                   test_fname=testdna,
                                   order=3,
                                   gap=0,
                                   reverse=False):
    """Compute Manhattan word distance matrices for DNA strings read via CSVFile.

    Args:
        train_fname: Passed to ``CSVFile`` to load the training strings.
        test_fname: Passed to ``CSVFile`` to load the test strings.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(dm_train, dm_test)``: train/train and train/test matrices.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, DNA
    from modshogun import SortWordString, ManhattanWordDistance, CSVFile

    def _load_words(fname):
        chars = StringCharFeatures(CSVFile(fname), DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _load_words(train_fname)
    sorter = SortWordString()
    sorter.init(train_words)  # initialised on the training features only
    train_words.add_preprocessor(sorter)
    train_words.apply_preprocessor()

    test_words = _load_words(test_fname)
    test_words.add_preprocessor(sorter)
    test_words.apply_preprocessor()

    metric = ManhattanWordDistance(train_words, train_words)
    train_matrix = metric.get_distance_matrix()

    metric.init(train_words, test_words)
    test_matrix = metric.get_distance_matrix()
    return train_matrix, test_matrix
def kernel_comm_ulong_string_modular(fm_train_dna=traindat,
                                     fm_test_dna=testdat,
                                     order=3,
                                     gap=0,
                                     reverse=False):
    """Evaluate a CommUlongStringKernel on sorted DNA ulong-word features.

    Args:
        fm_train_dna: Training strings, loaded with the DNA alphabet.
        fm_test_dna: Test strings, loaded with the DNA alphabet.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``; the kernel object is left
        initialised on train vs. test.
    """
    from modshogun import CommUlongStringKernel
    from modshogun import StringUlongFeatures, StringCharFeatures, DNA
    from modshogun import SortUlongString

    def _ulong_words(strings):
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringUlongFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _ulong_words(fm_train_dna)
    sorter = SortUlongString()
    sorter.init(train_words)  # initialised on the training features only
    train_words.add_preprocessor(sorter)
    train_words.apply_preprocessor()

    test_words = _ulong_words(fm_test_dna)
    test_words.add_preprocessor(sorter)
    test_words.apply_preprocessor()

    # use_sign is fixed to False for this example.
    comm_kernel = CommUlongStringKernel(train_words, train_words, False)

    train_gram = comm_kernel.get_kernel_matrix()
    comm_kernel.init(train_words, test_words)
    test_gram = comm_kernel.get_kernel_matrix()
    return train_gram, test_gram, comm_kernel
def kernel_histogram_word_string_modular(fm_train_dna=traindat,
                                         fm_test_dna=testdat,
                                         label_train_dna=label_traindat,
                                         order=3,
                                         gap=0,
                                         reverse=False):
    """Evaluate a HistogramWordStringKernel backed by a trained PluginEstimate.

    Args:
        fm_train_dna: Training strings, loaded with the DNA alphabet.
        fm_test_dna: Test strings, loaded with the DNA alphabet.
        label_train_dna: Training labels wrapped in ``BinaryLabels``.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``; the kernel object is left
        initialised on train vs. test.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from modshogun import HistogramWordStringKernel
    from modshogun import PluginEstimate

    def _dna_words(strings):
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _dna_words(fm_train_dna)
    test_words = _dna_words(fm_test_dna)

    estimator = PluginEstimate()
    train_labels = BinaryLabels(label_train_dna)
    estimator.set_labels(train_labels)
    estimator.set_features(train_words)
    estimator.train()

    hist_kernel = HistogramWordStringKernel(train_words, train_words, estimator)
    train_gram = hist_kernel.get_kernel_matrix()

    hist_kernel.init(train_words, test_words)
    estimator.set_features(test_words)
    # The predicted labels are computed but intentionally not returned.
    estimator.apply().get_labels()
    test_gram = hist_kernel.get_kernel_matrix()
    return train_gram, test_gram, hist_kernel
def kernel_comm_ulong_string_modular(fm_train_dna=traindat, fm_test_dna=testdat,
                                     order=3, gap=0, reverse=False):
    """Build train/train and train/test CommUlongStringKernel matrices.

    Both inputs are loaded as DNA char features, converted to ulong word
    features and run through one shared ``SortUlongString`` preprocessor that
    is initialised on the training features.

    Args:
        fm_train_dna: Training strings.
        fm_test_dna: Test strings.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from modshogun import CommUlongStringKernel
    from modshogun import StringUlongFeatures, StringCharFeatures, DNA
    from modshogun import SortUlongString

    def _convert(strings):
        char_feats = StringCharFeatures(DNA)
        char_feats.set_features(strings)
        ulong_feats = StringUlongFeatures(char_feats.get_alphabet())
        ulong_feats.obtain_from_char(char_feats, order - 1, order, gap, reverse)
        return ulong_feats

    def _sort(ulong_feats, sorter):
        ulong_feats.add_preprocessor(sorter)
        ulong_feats.apply_preprocessor()

    lhs = _convert(fm_train_dna)
    sorter = SortUlongString()
    sorter.init(lhs)
    _sort(lhs, sorter)

    rhs = _convert(fm_test_dna)
    _sort(rhs, sorter)

    sign_only = False  # passed as the kernel's use_sign argument
    kernel = CommUlongStringKernel(lhs, lhs, sign_only)

    km_train = kernel.get_kernel_matrix()
    kernel.init(lhs, rhs)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def distance_canberraword_modular(fm_train_dna=traindna, fm_test_dna=testdna,
                                  order=3, gap=0, reverse=False):
    """Compute Canberra word distance matrices on sorted DNA word features.

    Args:
        fm_train_dna: Training strings, loaded with the DNA alphabet.
        fm_test_dna: Test strings, loaded with the DNA alphabet.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(distance, dm_train, dm_test)``; the distance object is left
        initialised on train vs. test.
    """
    from modshogun import CanberraWordDistance
    from modshogun import SortWordString
    from modshogun import StringCharFeatures, StringWordFeatures, DNA

    def _as_words(sequences):
        chars = StringCharFeatures(DNA)
        chars.set_features(sequences)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _as_words(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)  # initialised on the training features only
    train_words.add_preprocessor(sorter)
    train_words.apply_preprocessor()

    test_words = _as_words(fm_test_dna)
    test_words.add_preprocessor(sorter)
    test_words.apply_preprocessor()

    metric = CanberraWordDistance(train_words, train_words)
    train_matrix = metric.get_distance_matrix()

    metric.init(train_words, test_words)
    test_matrix = metric.get_distance_matrix()
    return metric, train_matrix, test_matrix
def kernel_salzberg_word_string_modular(fm_train_dna=traindat,
                                        fm_test_dna=testdat,
                                        label_train_dna=label_traindat,
                                        order=3,
                                        gap=0,
                                        reverse=False):
    """Evaluate a SalzbergWordStringKernel backed by a trained PluginEstimate.

    Args:
        fm_train_dna: Training strings for ``StringCharFeatures(..., DNA)``.
        fm_test_dna: Test strings for ``StringCharFeatures(..., DNA)``.
        label_train_dna: Training labels wrapped in ``BinaryLabels``.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``; the kernel object is left
        initialised on train vs. test.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from modshogun import SalzbergWordStringKernel
    from modshogun import PluginEstimate

    def _dna_words(strings):
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _dna_words(fm_train_dna)
    test_words = _dna_words(fm_test_dna)

    estimator = PluginEstimate()
    train_labels = BinaryLabels(label_train_dna)
    estimator.set_labels(train_labels)
    estimator.set_features(train_words)
    estimator.train()

    # The kernel receives both the trained estimator and the training labels.
    salzberg = SalzbergWordStringKernel(train_words, train_words,
                                        estimator, train_labels)
    train_gram = salzberg.get_kernel_matrix()

    salzberg.init(train_words, test_words)
    estimator.set_features(test_words)
    # The predicted labels are computed but intentionally not returned.
    estimator.apply().get_labels()
    test_gram = salzberg.get_kernel_matrix()
    return train_gram, test_gram, salzberg
def kernel_histogram_word_string_modular(fm_train_dna=traindat,
                                         fm_test_dna=testdat,
                                         label_train_dna=label_traindat,
                                         order=3,
                                         ppseudo_count=1,
                                         npseudo_count=1):
    """Evaluate a HistogramWordStringKernel with configurable pseudo counts.

    Word features are always derived with gap ``0`` and reverse ``False``.

    Args:
        fm_train_dna: Training strings, loaded with the DNA alphabet.
        fm_test_dna: Test strings, loaded with the DNA alphabet.
        label_train_dna: Training labels wrapped in ``BinaryLabels``.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        ppseudo_count: First constructor argument of ``PluginEstimate``.
        npseudo_count: Second constructor argument of ``PluginEstimate``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``; the kernel object is left
        initialised on train vs. test.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from modshogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
    from modshogun import PluginEstimate

    def _dna_words(strings):
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, 0, False)
        return words

    train_words = _dna_words(fm_train_dna)
    test_words = _dna_words(fm_test_dna)

    estimator = PluginEstimate(ppseudo_count, npseudo_count)
    train_labels = BinaryLabels(label_train_dna)
    estimator.set_labels(train_labels)
    estimator.set_features(train_words)
    estimator.train()

    hist_kernel = HistogramWordStringKernel(train_words, train_words, estimator)
    train_gram = hist_kernel.get_kernel_matrix()

    hist_kernel.init(train_words, test_words)
    estimator.set_features(test_words)
    # The predicted labels are computed but intentionally not returned.
    estimator.apply().get_labels()
    test_gram = hist_kernel.get_kernel_matrix()
    return train_gram, test_gram, hist_kernel
# Example #10
0
def kernel_fisher_modular(fm_train_dna=traindat,
                          fm_test_dna=testdat,
                          label_train_dna=label_traindat,
                          N=1,
                          M=4,
                          pseudo=1e-1,
                          order=1,
                          gap=0,
                          reverse=False,
                          kargs=[1, False, True]):
    """Evaluate a PolyKernel on Fisher-kernel features from two class HMMs.

    One HMM is trained on the training strings labelled ``1`` and one on those
    labelled ``-1``. Both are then given the full training (and, as clones,
    test) observations to build ``FKFeatures``.

    Args:
        fm_train_dna: Training strings for ``StringCharFeatures(..., DNA)``.
        fm_test_dna: Test strings for ``StringCharFeatures(..., DNA)``.
        label_train_dna: One label per training string; entries equal to ``1``
            select the positive HMM's data, entries equal to ``-1`` the
            negative HMM's data.
        N: Second argument of the ``HMM`` constructor.
        M: Third argument of the ``HMM`` constructor.
        pseudo: Fourth argument of the ``HMM`` constructor.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.
        kargs: Extra positional arguments unpacked into ``PolyKernel``; only
            read, never mutated.

    Returns:
        Tuple ``(km_train, km_test, kernel)``; the kernel object is left
        initialised on train vs. test.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from modshogun import PolyKernel
    from modshogun import HMM, BW_NORMAL

    def _to_words(strings):
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    def _train_hmm(strings):
        model = HMM(_to_words(strings), N, M, pseudo)
        model.baum_welch_viterbi_train(BW_NORMAL)
        return model

    # Split the training strings by label here rather than reading
    # undeclared module-level names, so the function honours its arguments.
    fm_hmm_pos = [seq for seq, lab in zip(fm_train_dna, label_train_dna)
                  if lab == 1]
    fm_hmm_neg = [seq for seq, lab in zip(fm_train_dna, label_train_dna)
                  if lab == -1]

    # One HMM per class.
    pos = _train_hmm(fm_hmm_pos)
    neg = _train_hmm(fm_hmm_neg)

    # Kernel training and testing data.
    wordfeats_train = _to_words(fm_train_dna)
    wordfeats_test = _to_words(fm_test_dna)

    # Kernel on training data.
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    # NOTE(review): the literal 10 is passed through unchanged; presumably a
    # cache size -- confirm against the FKFeatures documentation.
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1)  # estimate prior
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # Kernel on testing data: clone the HMMs so the training features keep
    # their own observations.
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a())  # use prior from training data
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
# Example #11
0
def kernel_top_modular(fm_train_dna=traindat,
                       fm_test_dna=testdat,
                       label_train_dna=label_traindat,
                       pseudo=1e-1,
                       order=1,
                       gap=0,
                       reverse=False,
                       kargs=[1, False, True]):
    """Evaluate a PolyKernel on TOP features derived from two class HMMs.

    One HMM is trained on the training strings labelled ``1`` and one on those
    labelled ``-1``. Both are then given the full training (and, as clones,
    test) observations to build ``TOPFeatures``.

    Args:
        fm_train_dna: Training strings for ``StringCharFeatures(..., DNA)``.
        fm_test_dna: Test strings for ``StringCharFeatures(..., DNA)``.
        label_train_dna: One label per training string; entries equal to ``1``
            select the positive HMM's data, entries equal to ``-1`` the
            negative HMM's data.
        pseudo: Fourth argument of the ``HMM`` constructor.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.
        kargs: Extra positional arguments unpacked into ``PolyKernel``; only
            read, never mutated.

    Returns:
        Tuple ``(km_train, km_test, kernel)``; the kernel object is left
        initialised on train vs. test.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
    from modshogun import PolyKernel
    from modshogun import HMM, BW_NORMAL

    N = 1  # toy HMM with 1 state
    M = 4  # 4 observations -> DNA

    def _to_words(strings):
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    def _train_hmm(strings):
        model = HMM(_to_words(strings), N, M, pseudo)
        model.baum_welch_viterbi_train(BW_NORMAL)
        return model

    # Split the training strings by label here rather than reading
    # undeclared module-level names, so the function honours its arguments.
    fm_hmm_pos = [seq for seq, lab in zip(fm_train_dna, label_train_dna)
                  if lab == 1]
    fm_hmm_neg = [seq for seq, lab in zip(fm_train_dna, label_train_dna)
                  if lab == -1]

    # One HMM per class.
    pos = _train_hmm(fm_hmm_pos)
    neg = _train_hmm(fm_hmm_neg)

    # Kernel training and testing data.
    wordfeats_train = _to_words(fm_train_dna)
    wordfeats_test = _to_words(fm_test_dna)

    # Kernel on training data.
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    # NOTE(review): the literal 10 and the two False flags are passed through
    # unchanged; confirm their meaning against the TOPFeatures documentation.
    feats_train = TOPFeatures(10, pos, neg, False, False)
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # Kernel on testing data: clone the HMMs so the training features keep
    # their own observations.
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False)
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
# Example #12
0
def get_kernel_mat(fm_train_dna,
                   fm_test_dna,
                   N,
                   M,
                   pseudo=1e-1,
                   order=1,
                   gap=0,
                   reverse=False):
    """Compute linear-kernel matrices on Fisher-kernel features of one HMM.

    A single HMM is trained on all training strings and copied to serve as
    the "negative" model as well. Progress messages and a dot-product sanity
    check are printed along the way.

    Args:
        fm_train_dna: Training strings for ``StringCharFeatures(..., DNA)``.
        fm_test_dna: Test strings for ``StringCharFeatures(..., DNA)``.
        N: Second argument of the ``HMM`` constructor.
        M: Third argument of the ``HMM`` constructor.
        pseudo: Fourth argument of the ``HMM`` constructor.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``; the kernel object is left
        initialised on train vs. test.
    """

    def _to_words(strings):
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    # The print calls use a single argument so the output is the same on
    # Python 2 and Python 3.
    print("hmm training")
    pos = HMM(_to_words(fm_train_dna), N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)
    neg = HMM(pos)  # the "negative" model is a copy of the trained one

    print("Kernel training data")
    wordfeats_train = _to_words(fm_train_dna)

    print("Kernel testing data")
    wordfeats_test = _to_words(fm_test_dna)

    print("get kernel on training data")
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1)  # estimate prior

    print('getting feature matrix')
    # Print the dot product of the first two feature vectors next to the
    # kernel entry [0, 1] so the two can be compared by eye.
    v0 = feats_train.get_feature_vector(0)
    v1 = feats_train.get_feature_vector(1)
    print(np.dot(v0, v1))
    kernel = LinearKernel(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()
    print("%s %s" % (km_train.shape, km_train[0, 1]))

    print("get kernel on testing data")
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a())  # use prior from training data
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def kernel_fisher_modular(fm_train_dna=traindat, fm_test_dna=testdat,
                          label_train_dna=label_traindat,
                          N=1, M=4, pseudo=1e-1, order=1, gap=0, reverse=False,
                          kargs=[1, False, True]):
    """Build Fisher-kernel features from per-class HMMs and apply a PolyKernel.

    The positive HMM is trained on the strings of ``fm_train_dna`` whose label
    equals ``1``, the negative HMM on those whose label equals ``-1``.

    Args:
        fm_train_dna: Training strings for ``StringCharFeatures(..., DNA)``.
        fm_test_dna: Test strings for ``StringCharFeatures(..., DNA)``.
        label_train_dna: One label per training string (``1`` or ``-1``).
        N: Second argument of the ``HMM`` constructor.
        M: Third argument of the ``HMM`` constructor.
        pseudo: Fourth argument of the ``HMM`` constructor.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.
        kargs: Extra positional arguments unpacked into ``PolyKernel``; only
            read, never mutated.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from modshogun import PolyKernel
    from modshogun import HMM, BW_NORMAL

    def _words_from(strings):
        charfeat = StringCharFeatures(strings, DNA)
        wordfeat = StringWordFeatures(charfeat.get_alphabet())
        wordfeat.obtain_from_char(charfeat, order - 1, order, gap, reverse)
        return wordfeat

    def _fit_hmm(strings):
        hmm = HMM(_words_from(strings), N, M, pseudo)
        hmm.baum_welch_viterbi_train(BW_NORMAL)
        return hmm

    # Derive the per-class training strings from the arguments; the previous
    # code read undeclared names and ignored label_train_dna entirely.
    labelled = list(zip(fm_train_dna, label_train_dna))
    fm_hmm_pos = [seq for seq, lab in labelled if lab == 1]
    fm_hmm_neg = [seq for seq, lab in labelled if lab == -1]

    pos = _fit_hmm(fm_hmm_pos)  # HMM for the positive class
    neg = _fit_hmm(fm_hmm_neg)  # HMM for the negative class

    wordfeats_train = _words_from(fm_train_dna)
    wordfeats_test = _words_from(fm_test_dna)

    # Kernel on training data.
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    # NOTE(review): the literal 10 is passed through unchanged; presumably a
    # cache size -- confirm against the FKFeatures documentation.
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1)  # estimate prior
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # Kernel on testing data, using copies of the trained HMMs.
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a())  # use prior from training data
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
# Example #14
0
def kernel_top_modular(fm_train_dna=traindat, fm_test_dna=testdat,
                       label_train_dna=label_traindat, pseudo=1e-1,
                       order=1, gap=0, reverse=False, kargs=[1, False, True]):
    """Build TOP features from per-class HMMs and apply a PolyKernel.

    The positive HMM is trained on the strings of ``fm_train_dna`` whose label
    equals ``1``, the negative HMM on those whose label equals ``-1``.

    Args:
        fm_train_dna: Training strings for ``StringCharFeatures(..., DNA)``.
        fm_test_dna: Test strings for ``StringCharFeatures(..., DNA)``.
        label_train_dna: One label per training string (``1`` or ``-1``).
        pseudo: Fourth argument of the ``HMM`` constructor.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.
        kargs: Extra positional arguments unpacked into ``PolyKernel``; only
            read, never mutated.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
    from modshogun import PolyKernel
    from modshogun import HMM, BW_NORMAL

    N = 1  # toy HMM with 1 state
    M = 4  # 4 observations -> DNA

    def _words_from(strings):
        charfeat = StringCharFeatures(strings, DNA)
        wordfeat = StringWordFeatures(charfeat.get_alphabet())
        wordfeat.obtain_from_char(charfeat, order - 1, order, gap, reverse)
        return wordfeat

    def _fit_hmm(strings):
        hmm = HMM(_words_from(strings), N, M, pseudo)
        hmm.baum_welch_viterbi_train(BW_NORMAL)
        return hmm

    # Derive the per-class training strings from the arguments; the previous
    # code read undeclared names and ignored label_train_dna entirely.
    labelled = list(zip(fm_train_dna, label_train_dna))
    fm_hmm_pos = [seq for seq, lab in labelled if lab == 1]
    fm_hmm_neg = [seq for seq, lab in labelled if lab == -1]

    pos = _fit_hmm(fm_hmm_pos)  # HMM for the positive class
    neg = _fit_hmm(fm_hmm_neg)  # HMM for the negative class

    wordfeats_train = _words_from(fm_train_dna)
    wordfeats_test = _words_from(fm_test_dna)

    # Kernel on training data.
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    # NOTE(review): the literal 10 and the two False flags are passed through
    # unchanged; confirm their meaning against the TOPFeatures documentation.
    feats_train = TOPFeatures(10, pos, neg, False, False)
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # Kernel on testing data, using copies of the trained HMMs.
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False)
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
# Example #15
0
def get_kernel_mat(fm_train_dna, fm_test_dna, N, M,
                   pseudo=1e-1, order=1, gap=0, reverse=False):
    """Return linear-kernel matrices over Fisher-kernel features of one HMM.

    The HMM is trained on every training string; a copy of it stands in for
    the second model that ``FKFeatures`` expects. Progress messages and a
    dot-product sanity check are printed.

    Args:
        fm_train_dna: Training strings for ``StringCharFeatures(..., DNA)``.
        fm_test_dna: Test strings for ``StringCharFeatures(..., DNA)``.
        N: Second argument of the ``HMM`` constructor.
        M: Third argument of the ``HMM`` constructor.
        pseudo: Fourth argument of the ``HMM`` constructor.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(km_train, km_test, kernel)``.
    """

    def _words_from(strings):
        charfeat = StringCharFeatures(strings, DNA)
        wordfeat = StringWordFeatures(charfeat.get_alphabet())
        wordfeat.obtain_from_char(charfeat, order - 1, order, gap, reverse)
        return wordfeat

    # Single-argument print(...) calls behave identically on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("hmm training")
    hmm_train = _words_from(fm_train_dna)
    pos = HMM(hmm_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)
    neg = HMM(pos)  # copy of the trained model

    print("Kernel training data")
    wordfeats_train = _words_from(fm_train_dna)

    print("Kernel testing data")
    wordfeats_test = _words_from(fm_test_dna)

    print("get kernel on training data")
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1)  # estimate prior

    print('getting feature matrix')
    # Dot product of the first two feature vectors, printed for comparison
    # with the kernel entry [0, 1] shown below.
    v0 = feats_train.get_feature_vector(0)
    v1 = feats_train.get_feature_vector(1)
    print(np.dot(v0, v1))
    kernel = LinearKernel(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()
    print("%s %s" % (km_train.shape, km_train[0, 1]))

    print("get kernel on testing data")
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a())  # use prior from training data
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
# Example #16
0
def get_feature_mat(fm_train_dna, fm_test_dna, N, M,
                    pseudo=1e-1, order=1, gap=0, reverse=False):
    """Return Fisher-kernel feature matrices for training and test strings.

    The HMM is trained on every training string; a copy of it stands in for
    the second model that ``FKFeatures`` expects. Progress messages are
    printed.

    Args:
        fm_train_dna: Training strings; its length sets the number of rows
            collected for the training matrix.
        fm_test_dna: Test strings; its length sets the number of rows
            collected for the test matrix.
        N: Second argument of the ``HMM`` constructor.
        M: Third argument of the ``HMM`` constructor.
        pseudo: Fourth argument of the ``HMM`` constructor.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(train_featmat, test_featmat)`` of numpy arrays.
    """

    def _words_from(strings):
        charfeat = StringCharFeatures(strings, DNA)
        wordfeat = StringWordFeatures(charfeat.get_alphabet())
        wordfeat.obtain_from_char(charfeat, order - 1, order, gap, reverse)
        return wordfeat

    # Single-argument print(...) calls behave identically on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("hmm training")
    hmm_train = _words_from(fm_train_dna)
    pos = HMM(hmm_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)
    neg = HMM(pos)  # copy of the trained model

    print("Kernel training data")
    wordfeats_train = _words_from(fm_train_dna)

    print("Kernel testing data")
    wordfeats_test = _words_from(fm_test_dna)

    print("get kernel on training data")
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1)  # estimate prior

    print('getting feature train')
    # NOTE(review): training rows come from get_computed_dot_feature_vector
    # while test rows come from get_feature_vector; kept as in the original --
    # confirm the asymmetry is intended.
    train_featmat = np.array(
        [feats_train.get_computed_dot_feature_vector(i)
         for i in range(len(fm_train_dna))])

    print("get feature on testing")
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a())  # use prior from training data

    test_featmat = np.array(
        [feats_test.get_feature_vector(i) for i in range(len(fm_test_dna))])
    return train_featmat, test_featmat
def distribution_linearhmm_modular(fm_dna=traindna, order=3, gap=0,
                                   reverse=False):
    """Train a LinearHMM on DNA word features and query its likelihoods.

    Args:
        fm_dna: Strings, loaded with the DNA alphabet.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(hmm, out_likelihood, out_sample)`` holding the trained model
        and the results of ``get_log_likelihood()`` and
        ``get_log_likelihood_sample()``.
    """
    from modshogun import LinearHMM
    from modshogun import StringWordFeatures, StringCharFeatures, DNA

    dna_chars = StringCharFeatures(DNA)
    dna_chars.set_features(fm_dna)
    word_feats = StringWordFeatures(dna_chars.get_alphabet())
    word_feats.obtain_from_char(dna_chars, order - 1, order, gap, reverse)

    model = LinearHMM(word_feats)
    model.train()

    # The following accessors are exercised for every (example, parameter)
    # pair; their results are discarded.
    model.get_transition_probs()

    n_vectors = word_feats.get_num_vectors()
    n_params = model.get_num_model_parameters()
    for vec_idx in range(n_vectors):
        for param_idx in range(n_params):
            model.get_log_derivative(param_idx, vec_idx)

    log_lik = model.get_log_likelihood()
    log_lik_sample = model.get_log_likelihood_sample()

    return model, log_lik, log_lik_sample
# Example #18
0
def distribution_linearhmm_modular(fm_dna=traindna,
                                   order=3,
                                   gap=0,
                                   reverse=False):
    """Fit a LinearHMM to DNA word features and return it with its likelihoods.

    Args:
        fm_dna: Strings, loaded with the DNA alphabet.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(hmm, out_likelihood, out_sample)``.
    """
    from modshogun import StringWordFeatures, StringCharFeatures, DNA
    from modshogun import LinearHMM

    def _dna_words(strings):
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    observations = _dna_words(fm_dna)

    linear_hmm = LinearHMM(observations)
    linear_hmm.train()

    # Called for its own sake; the return value is not used.
    linear_hmm.get_transition_probs()

    example_count = observations.get_num_vectors()
    param_count = linear_hmm.get_num_model_parameters()
    for example in range(example_count):
        for param in range(param_count):
            # The derivative is evaluated for every pair and then discarded.
            linear_hmm.get_log_derivative(param, example)

    likelihood = linear_hmm.get_log_likelihood()
    sample_likelihood = linear_hmm.get_log_likelihood_sample()

    return linear_hmm, likelihood, sample_likelihood
# Example #19
0
def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
    """Train an HMM on CUBE word features and return its likelihoods.

    Args:
        fm_cube: Strings, loaded with the CUBE alphabet.
        N: Second argument of the ``HMM`` constructor; also the number of
            ``get_best_path_state`` queries made per example.
        M: Third argument of the ``HMM`` constructor.
        pseudo: Fourth argument of the ``HMM`` constructor.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.
        num_examples: Ignored; overwritten with ``feats.get_num_vectors()``
            before it is first read.

    Returns:
        Tuple ``(lik_example, lik_sample, hmm)``.
    """
    from modshogun import HMM, BW_NORMAL
    from modshogun import StringWordFeatures, StringCharFeatures, CUBE

    cube_chars = StringCharFeatures(CUBE)
    cube_chars.set_features(fm_cube)
    word_feats = StringWordFeatures(cube_chars.get_alphabet())
    word_feats.obtain_from_char(cube_chars, order - 1, order, gap, reverse)

    model = HMM(word_feats, N, M, pseudo)
    model.train()
    model.baum_welch_viterbi_train(BW_NORMAL)

    # The caller-supplied value is replaced by the actual vector count.
    num_examples = word_feats.get_num_vectors()
    n_params = model.get_num_model_parameters()
    for vec_idx in range(num_examples):
        for param_idx in range(n_params):
            model.get_log_derivative(param_idx, vec_idx)

    # The sums below are accumulated but never returned.
    path_total = 0
    state_total = 0
    for vec_idx in range(num_examples):
        path_total += model.best_path(vec_idx)
        for state_idx in range(N):
            state_total += model.get_best_path_state(vec_idx, state_idx)

    example_likelihood = model.get_log_likelihood()
    sample_likelihood = model.get_log_likelihood_sample()

    return example_likelihood, sample_likelihood, model
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """
    Build the sorted DNA word-feature object used by the spectrum kernel.

    ``data`` is loaded as DNA char features, converted to word features with
    ``obtain_from_char(chars, order - 1, order, gap, reverse)`` and then
    passed through a freshly initialised ``SortWordString`` preprocessor.
    Returns the resulting ``StringWordFeatures`` object.
    """
    dna_chars = StringCharFeatures(data, DNA)
    spectrum = StringWordFeatures(dna_chars.get_alphabet())
    spectrum.obtain_from_char(dna_chars, order - 1, order, gap, reverse)

    sorter = SortWordString()
    sorter.init(spectrum)
    spectrum.add_preprocessor(sorter)
    spectrum.apply_preprocessor()
    return spectrum
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """
    Build the sorted PROTEIN word-feature object used by the spectrum kernel.

    ``data`` is loaded as PROTEIN char features, converted to word features
    with ``obtain_from_char(chars, order - 1, order, gap, reverse)`` and then
    passed through a freshly initialised ``SortWordString`` preprocessor.
    Returns the resulting ``StringWordFeatures`` object.
    """
    protein_chars = StringCharFeatures(data, PROTEIN)
    spectrum = StringWordFeatures(protein_chars.get_alphabet())
    spectrum.obtain_from_char(protein_chars, order - 1, order, gap, reverse)

    sorter = SortWordString()
    sorter.init(spectrum)
    spectrum.add_preprocessor(sorter)
    spectrum.apply_preprocessor()
    return spectrum
def distance_manhattenword_modular(train_fname=traindna, test_fname=testdna,
                                   order=3, gap=0, reverse=False):
    """Return Manhattan word distance matrices for DNA strings read via CSVFile.

    Both files are loaded as DNA char features, converted to word features
    and run through one shared ``SortWordString`` preprocessor that is
    initialised on the training features.

    Args:
        train_fname: Passed to ``CSVFile`` to load the training strings.
        test_fname: Passed to ``CSVFile`` to load the test strings.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).
        gap: Forwarded to ``obtain_from_char``.
        reverse: Forwarded to ``obtain_from_char``.

    Returns:
        Tuple ``(dm_train, dm_test)``.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, DNA
    from modshogun import SortWordString, ManhattanWordDistance, CSVFile

    def _read_words(path):
        char_feats = StringCharFeatures(CSVFile(path), DNA)
        word_feats = StringWordFeatures(char_feats.get_alphabet())
        word_feats.obtain_from_char(char_feats, order - 1, order, gap, reverse)
        return word_feats

    def _sort(word_feats, sorter):
        word_feats.add_preprocessor(sorter)
        word_feats.apply_preprocessor()

    lhs = _read_words(train_fname)
    sorter = SortWordString()
    sorter.init(lhs)
    _sort(lhs, sorter)

    rhs = _read_words(test_fname)
    _sort(rhs, sorter)

    manhattan = ManhattanWordDistance(lhs, lhs)

    dm_train = manhattan.get_distance_matrix()
    manhattan.init(lhs, rhs)
    dm_test = manhattan.get_distance_matrix()
    return dm_train, dm_test
def make_string_feature(astringv, start=1, order=8, gap=0, reverse=False):
    """Convert raw byte strings into sorted ulong word features.

    Args:
        astringv: Strings for ``StringCharFeatures(..., RAWBYTE)``.
        start: Second argument of ``obtain_from_char``.
        order: Third argument of ``obtain_from_char``.
        gap: Fourth argument of ``obtain_from_char``.
        reverse: Fifth argument of ``obtain_from_char``.

    Returns:
        The ``StringUlongFeatures`` object after a ``SortUlongString``
        preprocessor has been applied to it.
    """
    from modshogun import SortUlongString
    from modshogun import StringUlongFeatures, StringCharFeatures, RAWBYTE

    raw_chars = StringCharFeatures(astringv, RAWBYTE)

    ulong_feats = StringUlongFeatures(raw_chars.get_alphabet())
    ulong_feats.obtain_from_char(raw_chars, start, order, gap, reverse)

    sorter = SortUlongString()
    sorter.init(ulong_feats)
    ulong_feats.add_preprocessor(sorter)
    ulong_feats.apply_preprocessor()

    return ulong_feats
# Example #24
0
def distribution_ppwm_modular(fm_dna=traindna, order=3):
    """Compute the ``w`` matrix and a scoring of a hand-specified PositionalPWM.

    The PWM, sigma, mean and window length are fixed inside the function.
    ``fm_dna`` is only converted to byte features and is not passed to the
    PositionalPWM.

    Args:
        fm_dna: Strings, loaded with the DNA alphabet.
        order: Forwarded to ``obtain_from_char`` (as ``order - 1`` and ``order``).

    Returns:
        Tuple ``(w, u)``: the result of ``get_w()`` after ``compute_w(20)``
        and the result of ``get_scoring(0)`` after ``compute_scoring(1)``.
    """
    from modshogun import StringByteFeatures, StringCharFeatures, DNA
    from modshogun import PositionalPWM

    from numpy import array, log

    # The byte features are built but not consumed below; the conversion is
    # kept so that invalid input still fails here as before.
    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    feats = StringByteFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, 0, False)

    L = 20
    sigma = 1
    mu = 4

    ppwm = PositionalPWM()
    ppwm.set_sigma(sigma)
    ppwm.set_mean(mu)
    # An earlier 4x4 matrix assigned to `pwm` was overwritten before use and
    # has been dropped; this 4x3 matrix is the one actually installed.
    pwm = array([[0.01, 0.09, 0.1], [0.09, 0.01, 0.1], [0.85, 0.4, 0.1],
                 [0.05, 0.5, 0.7]])

    ppwm.set_pwm(log(pwm))  # the model is given the element-wise log
    ppwm.compute_w(L)
    w = ppwm.get_w()

    ppwm.compute_scoring(1)
    u = ppwm.get_scoring(0)

    return w, u
def distribution_ppwm_modular(fm_dna=traindna, order=3):
    """Run a ``PositionalPWM`` with a fixed weight matrix and return its outputs.

    Args:
        fm_dna: DNA strings passed to ``StringCharFeatures.set_features``.
        order: Word order; ``order - 1`` and ``order`` are passed as the second
            and third arguments of ``StringByteFeatures.obtain_from_char``.

    Returns:
        Tuple ``(w, u)`` where ``w`` is ``ppwm.get_w()`` after
        ``compute_w(20)`` and ``u`` is ``ppwm.get_scoring(0)`` after
        ``compute_scoring(1)``.
    """
    from modshogun import StringByteFeatures, StringCharFeatures, DNA
    from modshogun import PositionalPWM

    from numpy import array, log

    # These byte features are never handed to the PWM (training from features
    # was disabled in this example); the conversion is kept so it still runs
    # on the supplied strings.
    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    feats = StringByteFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, 0, False)

    window_len = 20
    sigma = 1
    mu = 4

    ppwm = PositionalPWM()
    ppwm.set_sigma(sigma)
    ppwm.set_mean(mu)

    # 4 x 3 weight matrix; set_pwm receives its element-wise logarithm.
    # NOTE(review): rows presumably map to the DNA symbols and columns to
    # motif positions -- confirm against the PositionalPWM documentation.
    pwm = array([
        [0.01, 0.09, 0.1],
        [0.09, 0.01, 0.1],
        [0.85, 0.4, 0.1],
        [0.05, 0.5, 0.7],
    ])
    ppwm.set_pwm(log(pwm))

    ppwm.compute_w(window_len)
    w = ppwm.get_w()

    ppwm.compute_scoring(1)
    u = ppwm.get_scoring(0)

    return w, u
def distribution_histogram_modular(fm_dna=traindna, order=3, gap=0, reverse=False):
    """Train a shogun ``Histogram`` on word features derived from DNA strings.

    Args:
        fm_dna: DNA strings passed to ``StringCharFeatures.set_features``.
        order: Word order; ``order - 1`` and ``order`` are passed as the second
            and third arguments of ``obtain_from_char``.
        gap: Fourth argument of ``obtain_from_char``.
        reverse: Fifth argument of ``obtain_from_char``.

    Returns:
        Tuple ``(histo, out_sample, out_likelihood)``: the trained histogram,
        the result of ``get_log_likelihood_sample()`` and the result of
        ``get_log_likelihood()``.
    """
    from modshogun import DNA, Histogram, StringCharFeatures, StringWordFeatures

    dna_chars = StringCharFeatures(DNA)
    dna_chars.set_features(fm_dna)

    word_feats = StringWordFeatures(dna_chars.get_alphabet())
    word_feats.obtain_from_char(dna_chars, order - 1, order, gap, reverse)

    model = Histogram(word_feats)
    model.train()

    # Queried for its own sake; the returned histogram is not used further.
    model.get_histogram()

    # Both sizes are only queried. The nested loop that consumed them
    # (get_log_derivative over every parameter/example pair) is disabled.
    n_vectors = word_feats.get_num_vectors()
    n_params = model.get_num_model_parameters()

    total_log_likelihood = model.get_log_likelihood()
    sample_log_likelihood = model.get_log_likelihood_sample()

    return model, sample_log_likelihood, total_log_likelihood
def distribution_histogram_modular(fm_dna=traindna,
                                   order=3,
                                   gap=0,
                                   reverse=False):
    """Fit a shogun ``Histogram`` to word features built from DNA strings.

    Args:
        fm_dna: DNA strings passed to ``StringCharFeatures.set_features``.
        order: Word order; ``order - 1`` and ``order`` are passed as the second
            and third arguments of ``obtain_from_char``.
        gap: Fourth argument of ``obtain_from_char``.
        reverse: Fifth argument of ``obtain_from_char``.

    Returns:
        Tuple ``(histo, out_sample, out_likelihood)``: the trained histogram,
        the result of ``get_log_likelihood_sample()`` and the result of
        ``get_log_likelihood()``.
    """
    from modshogun import DNA, Histogram, StringCharFeatures, StringWordFeatures

    char_features = StringCharFeatures(DNA)
    char_features.set_features(fm_dna)

    word_features = StringWordFeatures(char_features.get_alphabet())
    word_features.obtain_from_char(
        char_features, order - 1, order, gap, reverse)

    histogram = Histogram(word_features)
    histogram.train()

    # The histogram itself is fetched but its value is not used afterwards.
    histogram.get_histogram()

    # Sizes are fetched only; the per-parameter/per-example
    # get_log_derivative loop that would use them is disabled.
    vector_count = word_features.get_num_vectors()
    parameter_count = histogram.get_num_model_parameters()

    likelihood = histogram.get_log_likelihood()
    per_sample = histogram.get_log_likelihood_sample()

    return histogram, per_sample, likelihood
def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse):
    """Repeatedly compute a ``CommWordStringKernel`` matrix on synthetic DNA.

    Ten times in a row the same 282 synthetic strings are turned into sorted
    word features and a kernel matrix is computed from them. Judging by the
    function name, the repetition is meant to expose memory leaks.

    Args:
        num: Repeat count; every string is ``num * 'ACGT'`` or
            ``num * 'TTGT'``.
        order: Word order; ``order - 1`` and ``order`` are passed as the second
            and third arguments of ``obtain_from_char``.
        gap: Fourth argument of ``obtain_from_char``.
        reverse: Fifth argument of ``obtain_from_char``.

    Returns:
        The kernel matrix computed in the last iteration.
    """
    from modshogun import Alphabet, StringCharFeatures, StringWordFeatures, DNA
    from modshogun import SortWordString
    from modshogun import CommWordStringKernel, IdentityKernelNormalizer

    acgt = num * 'ACGT'
    ttgt = num * 'TTGT'

    # Same contents as the hand-written literals this replaces: 60 'ACGT'
    # strings, 21 'TTGT' strings, then 60 'ACGT' strings (141 per list).
    # Both lists were identical in the original as well.
    pos = [acgt] * 60 + [ttgt] * 21 + [acgt] * 60
    neg = [acgt] * 60 + [ttgt] * 21 + [acgt] * 60

    for _ in range(10):
        alpha = Alphabet(DNA)
        traindat = StringCharFeatures(alpha)
        # A fresh concatenated list is passed on every iteration.
        traindat.set_features(pos + neg)

        trainudat = StringWordFeatures(traindat.get_alphabet())
        trainudat.obtain_from_char(traindat, order - 1, order, gap, reverse)

        pre = SortWordString()
        pre.init(trainudat)
        trainudat.add_preprocessor(pre)
        trainudat.apply_preprocessor()

        # NOTE(review): the arguments are presumably (cache size, use_sign)
        # -- confirm against the CommWordStringKernel documentation.
        spec = CommWordStringKernel(10, False)
        spec.set_normalizer(IdentityKernelNormalizer())
        spec.init(trainudat, trainudat)
        K = spec.get_kernel_matrix()

    return K
def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse):
    """Build word features and a ``CommWordStringKernel`` matrix ten times.

    Each of the ten rounds converts the same 282 synthetic DNA strings into
    sorted word features and computes a kernel matrix from them. Judging by
    the function name, the repetition is meant to expose memory leaks.

    Args:
        num: Repeat count; every string is ``num * 'ACGT'`` or
            ``num * 'TTGT'``.
        order: Word order; ``order - 1`` and ``order`` are passed as the second
            and third arguments of ``obtain_from_char``.
        gap: Fourth argument of ``obtain_from_char``.
        reverse: Fifth argument of ``obtain_from_char``.

    Returns:
        The kernel matrix computed in the last round.
    """
    from modshogun import Alphabet, StringCharFeatures, StringWordFeatures, DNA
    from modshogun import SortWordString
    from modshogun import CommWordStringKernel, IdentityKernelNormalizer

    acgt = num * 'ACGT'
    ttgt = num * 'TTGT'

    # Equivalent to the explicit literals this replaces: 60 'ACGT' strings,
    # 21 'TTGT' strings, then 60 'ACGT' strings (141 entries per list).
    # POS and NEG had identical contents in the original too.
    pos = [acgt] * 60 + [ttgt] * 21 + [acgt] * 60
    neg = [acgt] * 60 + [ttgt] * 21 + [acgt] * 60

    for _ in range(10):
        alpha = Alphabet(DNA)
        traindat = StringCharFeatures(alpha)
        # A fresh concatenated list is passed on every round.
        traindat.set_features(pos + neg)

        trainudat = StringWordFeatures(traindat.get_alphabet())
        trainudat.obtain_from_char(traindat, order - 1, order, gap, reverse)

        pre = SortWordString()
        pre.init(trainudat)
        trainudat.add_preprocessor(pre)
        trainudat.apply_preprocessor()

        # NOTE(review): the arguments are presumably (cache size, use_sign)
        # -- confirm against the CommWordStringKernel documentation.
        spec = CommWordStringKernel(10, False)
        spec.set_normalizer(IdentityKernelNormalizer())
        spec.init(trainudat, trainudat)
        K = spec.get_kernel_matrix()

    return K