def kernel_histogram_word_string (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,order=3,ppseudo_count=1,npseudo_count=1):

	from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
	from shogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
	from shogun import PluginEstimate#, MSG_DEBUG

	charfeat=StringCharFeatures(DNA)
	#charfeat.io.set_loglevel(MSG_DEBUG)
	charfeat.set_features(fm_train_dna)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, 0, False)

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_test_dna)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, 0, False)

	pie=PluginEstimate(ppseudo_count,npseudo_count)
	labels=BinaryLabels(label_train_dna)
	pie.set_labels(labels)
	pie.set_features(feats_train)
	pie.train()

	kernel=HistogramWordStringKernel(feats_train, feats_train, pie)
	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	pie.set_features(feats_test)
	pie.apply().get_labels()
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
예제 #2
0
def distribution_hmm(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
    from shogun import StringWordFeatures, StringCharFeatures, CUBE
    from shogun import HMM, BW_NORMAL

    charfeat = StringCharFeatures(CUBE)
    charfeat.set_features(fm_cube)
    feats = StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    hmm = HMM(feats, N, M, pseudo)
    hmm.train()
    hmm.baum_welch_viterbi_train(BW_NORMAL)

    num_examples = feats.get_num_vectors()
    num_param = hmm.get_num_model_parameters()
    for i in range(num_examples):
        for j in range(num_param):
            hmm.get_log_derivative(j, i)

    best_path = 0
    best_path_state = 0
    for i in range(num_examples):
        best_path += hmm.best_path(i)
        for j in range(N):
            best_path_state += hmm.get_best_path_state(i, j)

    lik_example = hmm.get_log_likelihood()
    lik_sample = hmm.get_log_likelihood_sample()

    return lik_example, lik_sample, hmm
예제 #3
0
def distribution_linearhmm(fm_dna=traindna, order=3, gap=0, reverse=False):

    from shogun import StringWordFeatures, StringCharFeatures, DNA
    from shogun import LinearHMM

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    feats = StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    hmm = LinearHMM(feats)
    hmm.train()

    hmm.get_transition_probs()

    num_examples = feats.get_num_vectors()
    num_param = hmm.get_num_model_parameters()
    for i in range(num_examples):
        for j in range(num_param):
            hmm.get_log_derivative(j, i)

    out_likelihood = hmm.get_log_likelihood()
    out_sample = hmm.get_log_likelihood_sample()

    return hmm, out_likelihood, out_sample
예제 #4
0
def kernel_match_word_string(fm_train_dna=traindat,
                             fm_test_dna=testdat,
                             degree=3,
                             scale=1.4,
                             size_cache=10,
                             order=3,
                             gap=0,
                             reverse=False):
    from shogun import MatchWordStringKernel, AvgDiagKernelNormalizer
    from shogun import StringWordFeatures, StringCharFeatures, DNA

    charfeat = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(DNA)
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    charfeat = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(DNA)
    feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    kernel = MatchWordStringKernel(size_cache, degree)
    kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
    kernel.init(feats_train, feats_train)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def kernel_histogram_word_string(fm_train_dna=traindat,
                                 fm_test_dna=testdat,
                                 label_train_dna=label_traindat,
                                 order=3,
                                 ppseudo_count=1,
                                 npseudo_count=1):

    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
    from shogun import PluginEstimate  #, MSG_DEBUG

    charfeat = StringCharFeatures(DNA)
    #charfeat.io.set_loglevel(MSG_DEBUG)
    charfeat.set_features(fm_train_dna)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, 0, False)

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test = StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order - 1, order, 0, False)

    pie = PluginEstimate(ppseudo_count, npseudo_count)
    labels = BinaryLabels(label_train_dna)
    pie.set_labels(labels)
    pie.set_features(feats_train)
    pie.train()

    kernel = HistogramWordStringKernel(feats_train, feats_train, pie)
    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.apply().get_labels()
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
예제 #6
0
def distribution_hmm(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
	from shogun import StringWordFeatures, StringCharFeatures, CUBE
	from shogun import HMM, BW_NORMAL

	charfeat=StringCharFeatures(CUBE)
	charfeat.set_features(fm_cube)
	feats=StringWordFeatures(charfeat.get_alphabet())
	feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

	hmm=HMM(feats, N, M, pseudo)
	hmm.train()
	hmm.baum_welch_viterbi_train(BW_NORMAL)

	num_examples=feats.get_num_vectors()
	num_param=hmm.get_num_model_parameters()
	for i in range(num_examples):
		for j in range(num_param):
			hmm.get_log_derivative(j, i)

	best_path=0
	best_path_state=0
	for i in range(num_examples):
		best_path+=hmm.best_path(i)
		for j in range(N):
			best_path_state+=hmm.get_best_path_state(i, j)

	lik_example = hmm.get_log_likelihood()
	lik_sample = hmm.get_log_likelihood_sample()

	return lik_example, lik_sample, hmm
def kernel_weighted_comm_word_string (fm_train_dna=traindat,fm_test_dna=testdat,order=3,gap=0,reverse=True ):
	from shogun import WeightedCommWordStringKernel
	from shogun import StringWordFeatures, StringCharFeatures, DNA
	from shogun import SortWordString

	charfeat=StringCharFeatures(fm_train_dna, DNA)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()

	charfeat=StringCharFeatures(fm_test_dna, DNA)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()

	use_sign=False
	kernel=WeightedCommWordStringKernel(feats_train, feats_train, use_sign)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
예제 #8
0
def kernel_salzberg_word_string(fm_train_dna=traindat,
                                fm_test_dna=testdat,
                                label_train_dna=label_traindat,
                                order=3,
                                gap=0,
                                reverse=False):
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import SalzbergWordStringKernel
    from shogun import PluginEstimate

    charfeat = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    charfeat = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    pie = PluginEstimate()
    labels = BinaryLabels(label_train_dna)
    pie.set_labels(labels)
    pie.set_features(feats_train)
    pie.train()

    kernel = SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.apply().get_labels()
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def distribution_linearhmm (fm_dna=traindna,order=3,gap=0,reverse=False):

	from shogun import StringWordFeatures, StringCharFeatures, DNA
	from shogun import LinearHMM

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_dna)
	feats=StringWordFeatures(charfeat.get_alphabet())
	feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

	hmm=LinearHMM(feats)
	hmm.train()

	hmm.get_transition_probs()

	num_examples=feats.get_num_vectors()
	num_param=hmm.get_num_model_parameters()
	for i in range(num_examples):
		for j in range(num_param):
			hmm.get_log_derivative(j, i)

	out_likelihood = hmm.get_log_likelihood()
	out_sample = hmm.get_log_likelihood_sample()

	return hmm,out_likelihood ,out_sample
예제 #10
0
    def init_sensor(self, kernel, svs):
        f = StringCharFeatures(svs, DNA)

        kname = kernel['name']
        if  kname == 'spectrum':
            wf = StringWordFeatures(f.get_alphabet())
            wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0, False)

            pre = SortWordString()
            pre.init(wf)
            wf.add_preprocessor(pre)
            wf.apply_preprocessor()
            f = wf

            k = CommWordStringKernel(0, False)
            k.set_use_dict_diagonal_optimization(kernel['order'] < 8)
            self.preproc = pre

        elif kname == 'wdshift':
                k = WeightedDegreePositionStringKernel(0, kernel['order'])
                k.set_normalizer(IdentityKernelNormalizer())
                k.set_shifts(kernel['shift'] *
                        numpy.ones(f.get_max_vector_length(), dtype=numpy.int32))
                k.set_position_weights(1.0 / f.get_max_vector_length() *
                        numpy.ones(f.get_max_vector_length(), dtype=numpy.float64))
        else:
            raise "Currently, only wdshift and spectrum kernels supported"

        self.kernel = k
        self.train_features = f

        return (self.kernel, self.train_features)
예제 #11
0
def preprocessor_sortwordstring(fm_train_dna=traindna,
                                fm_test_dna=testdna,
                                order=3,
                                gap=0,
                                reverse=False,
                                use_sign=False):

    from shogun import CommWordStringKernel
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString

    charfeat = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()

    charfeat = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    kernel = CommWordStringKernel(feats_train, feats_train, use_sign)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
예제 #12
0
def kernel_poly_match_word_string(fm_train_dna=traindat,
                                  fm_test_dna=testdat,
                                  degree=2,
                                  inhomogene=True,
                                  order=3,
                                  gap=0,
                                  reverse=False):
    from shogun import PolyMatchWordStringKernel
    from shogun import StringWordFeatures, StringCharFeatures, DNA

    charfeat = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(DNA)
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    charfeat = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(DNA)
    feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    kernel = PolyMatchWordStringKernel(feats_train, feats_train, degree,
                                       inhomogene)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def kernel_salzberg_word_string (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,
order=3,gap=0,reverse=False):
	from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
	from shogun import SalzbergWordStringKernel
	from shogun import PluginEstimate

	charfeat=StringCharFeatures(fm_train_dna, DNA)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

	charfeat=StringCharFeatures(fm_test_dna, DNA)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

	pie=PluginEstimate()
	labels=BinaryLabels(label_train_dna)
	pie.set_labels(labels)
	pie.set_features(feats_train)
	pie.train()

	kernel=SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	pie.set_features(feats_test)
	pie.apply().get_labels()
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
예제 #14
0
def distance_hammingword (fm_train_dna=traindna,fm_test_dna=testdna,
		fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False):

	from shogun import StringCharFeatures, StringWordFeatures, DNA
	from shogun import SortWordString
	from shogun import HammingWordDistance

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_train_dna)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_test_dna)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()

	distance=HammingWordDistance(feats_train, feats_train, use_sign)

	dm_train=distance.get_distance_matrix()
	distance.init(feats_train, feats_test)
	dm_test=distance.get_distance_matrix()
	return distance,dm_train,dm_test
예제 #15
0
def distance_manhattenword(train_fname=traindna,
                           test_fname=testdna,
                           order=3,
                           gap=0,
                           reverse=False):
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString, ManhattanWordDistance, CSVFile

    charfeat = StringCharFeatures(CSVFile(train_fname), DNA)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()

    charfeat = StringCharFeatures(CSVFile(test_fname), DNA)
    feats_test = StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    distance = ManhattanWordDistance(feats_train, feats_train)

    dm_train = distance.get_distance_matrix()
    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()
    return dm_train, dm_test
예제 #16
0
def distance_canberraword (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False):
	from shogun import StringCharFeatures, StringWordFeatures, DNA
	from shogun import SortWordString
	from shogun import CanberraWordDistance

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_train_dna)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_test_dna)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()

	distance=CanberraWordDistance(feats_train, feats_train)

	dm_train=distance.get_distance_matrix()
	distance.init(feats_train, feats_test)
	dm_test=distance.get_distance_matrix()
	return distance,dm_train,dm_test
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """
    create feature object used by spectrum kernel
    """

    charfeat = StringCharFeatures(data, DNA)
    feat = StringWordFeatures(charfeat.get_alphabet())
    feat.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feat)
    feat.add_preprocessor(preproc)
    feat.apply_preprocessor()

    return feat
예제 #18
0
def kernel_top(fm_train_dna=traindat,
               fm_test_dna=testdat,
               label_train_dna=label_traindat,
               pseudo=1e-1,
               order=1,
               gap=0,
               reverse=False,
               kargs=[1, False, True]):
    from shogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
    from shogun import PolyKernel
    from shogun import HMM, BW_NORMAL

    N = 1  # toy HMM with 1 state
    M = 4  # 4 observations -> DNA

    # train HMM for positive class
    charfeat = StringCharFeatures(fm_hmm_pos, DNA)
    hmm_pos_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_pos_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    pos = HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    charfeat = StringCharFeatures(fm_hmm_neg, DNA)
    hmm_neg_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_neg_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    neg = HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # Kernel training data
    charfeat = StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # Kernel testing data
    charfeat = StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # get kernel on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = TOPFeatures(10, pos, neg, False, False)
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # get kernel on testing data
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False)
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
예제 #19
0
    def get_test_features(self, seq, window):
        start = self.window[0] - window[0]
        end = len(seq) - window[1] + self.window[2]
        size = self.window[2] - self.window[0] + 1
        seq = seq[start:end]
        seq = seq.replace("N", "A").replace("R", "A").replace("M", "A")
        f = StringCharFeatures([seq], DNA)

        if self.preproc:
            wf = StringWordFeatures(f.get_alphabet())
            o = self.train_features.get_order()
            wf.obtain_from_char(f, 0, o, 0, False)
            f = wf
            f.obtain_by_sliding_window(size, 1, o - 1)
        else:
            f.obtain_by_sliding_window(size, 1)

        return f
예제 #20
0
def kernel_fisher (fm_train_dna=traindat, fm_test_dna=testdat,
		label_train_dna=label_traindat,
		N=1,M=4,pseudo=1e-1,order=1,gap=0,reverse=False,
		kargs=[1,False,True]):

	from shogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
	from shogun import PolyKernel
	from shogun import HMM, BW_NORMAL#, MSG_DEBUG

	# train HMM for positive class
	charfeat=StringCharFeatures(fm_hmm_pos, DNA)
	#charfeat.io.set_loglevel(MSG_DEBUG)
	hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
	hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	pos=HMM(hmm_pos_train, N, M, pseudo)
	pos.baum_welch_viterbi_train(BW_NORMAL)

	# train HMM for negative class
	charfeat=StringCharFeatures(fm_hmm_neg, DNA)
	hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
	hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	neg=HMM(hmm_neg_train, N, M, pseudo)
	neg.baum_welch_viterbi_train(BW_NORMAL)

	# Kernel training data
	charfeat=StringCharFeatures(fm_train_dna, DNA)
	wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
	wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

	# Kernel testing data
	charfeat=StringCharFeatures(fm_test_dna, DNA)
	wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
	wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

	# get kernel on training data
	pos.set_observations(wordfeats_train)
	neg.set_observations(wordfeats_train)
	feats_train=FKFeatures(10, pos, neg)
	feats_train.set_opt_a(-1) #estimate prior
	kernel=PolyKernel(feats_train, feats_train, *kargs)
	km_train=kernel.get_kernel_matrix()

	# get kernel on testing data
	pos_clone=HMM(pos)
	neg_clone=HMM(neg)
	pos_clone.set_observations(wordfeats_test)
	neg_clone.set_observations(wordfeats_test)
	feats_test=FKFeatures(10, pos_clone, neg_clone)
	feats_test.set_a(feats_train.get_a()) #use prior from training data
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
예제 #21
0
def runShogunSVMDNASpectrumKernel(train_xt, train_lt, test_xt):
	"""
	run svm with spectrum kernel
	"""

    ##################################################
    # set up SVM
	charfeat_train = StringCharFeatures(train_xt, DNA)
	feats_train = StringWordFeatures(DNA)
	feats_train.obtain_from_char(charfeat_train, K-1, K, GAP, False)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()
	
	charfeat_test = StringCharFeatures(test_xt, DNA)
	feats_test=StringWordFeatures(DNA)
	feats_test.obtain_from_char(charfeat_test, K-1, K, GAP, False)
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()
	
	kernel=CommWordStringKernel(feats_train, feats_train, False)
	kernel.io.set_loglevel(MSG_DEBUG)

    # init kernel
	labels = BinaryLabels(train_lt)
	
	# run svm model
	print "Ready to train!"
	svm=LibSVM(SVMC, kernel, labels)
	svm.io.set_loglevel(MSG_DEBUG)
	svm.train()

	# predictions
	print "Making predictions!"
	out1DecisionValues = svm.apply(feats_train)
	out1=out1DecisionValues.get_labels()
	kernel.init(feats_train, feats_test)
	out2DecisionValues = svm.apply(feats_test)
	out2=out2DecisionValues.get_labels()

	return out1,out2,out1DecisionValues,out2DecisionValues
def kernel_match_word_string (fm_train_dna=traindat,fm_test_dna=testdat,
degree=3,scale=1.4,size_cache=10,order=3,gap=0,reverse=False):
	from shogun import MatchWordStringKernel, AvgDiagKernelNormalizer
	from shogun import StringWordFeatures, StringCharFeatures, DNA

	charfeat=StringCharFeatures(fm_train_dna, DNA)
	feats_train=StringWordFeatures(DNA)
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

	charfeat=StringCharFeatures(fm_test_dna, DNA)
	feats_test=StringWordFeatures(DNA)
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

	kernel=MatchWordStringKernel(size_cache, degree)
	kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
	kernel.init(feats_train, feats_train)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
예제 #23
0
def kernel_top (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,pseudo=1e-1,
	order=1,gap=0,reverse=False,kargs=[1, False, True]):
	from shogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
	from shogun import PolyKernel
	from shogun import HMM, BW_NORMAL

	N=1 # toy HMM with 1 state
	M=4 # 4 observations -> DNA


	# train HMM for positive class
	charfeat=StringCharFeatures(fm_hmm_pos, DNA)
	hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
	hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	pos=HMM(hmm_pos_train, N, M, pseudo)
	pos.baum_welch_viterbi_train(BW_NORMAL)

	# train HMM for negative class
	charfeat=StringCharFeatures(fm_hmm_neg, DNA)
	hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
	hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	neg=HMM(hmm_neg_train, N, M, pseudo)
	neg.baum_welch_viterbi_train(BW_NORMAL)

	# Kernel training data
	charfeat=StringCharFeatures(fm_train_dna, DNA)
	wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
	wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

	# Kernel testing data
	charfeat=StringCharFeatures(fm_test_dna, DNA)
	wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
	wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

	# get kernel on training data
	pos.set_observations(wordfeats_train)
	neg.set_observations(wordfeats_train)
	feats_train=TOPFeatures(10, pos, neg, False, False)
	kernel=PolyKernel(feats_train, feats_train, *kargs)
	km_train=kernel.get_kernel_matrix()

	# get kernel on testing data
	pos_clone=HMM(pos)
	neg_clone=HMM(neg)
	pos_clone.set_observations(wordfeats_test)
	neg_clone.set_observations(wordfeats_test)
	feats_test=TOPFeatures(10, pos_clone, neg_clone, False, False)
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
예제 #24
0
def features_string_word(strings, start, order, gap, rev):
    from shogun import StringCharFeatures, StringWordFeatures, RAWBYTE
    from numpy import array, uint16

    #create string features
    cf = StringCharFeatures(strings, RAWBYTE)
    wf = StringWordFeatures(RAWBYTE)

    wf.obtain_from_char(cf, start, order, gap, rev)

    #and output several stats
    #print("max string length", wf.get_max_vector_length())
    #print("number of strings", wf.get_num_vectors())
    #print("length of first string", wf.get_vector_length(0))
    #print("string[2]", wf.get_feature_vector(2))
    #print("strings", wf.get_features())

    #replace string 0
    wf.set_feature_vector(array([1, 2, 3, 4, 5], dtype=uint16), 0)

    #print("strings", wf.get_features())
    return wf.get_features(), wf
예제 #25
0
def features_string_word (strings, start, order, gap, rev):
	from shogun import StringCharFeatures, StringWordFeatures, RAWBYTE
	from numpy import array, uint16

	#create string features
	cf=StringCharFeatures(strings, RAWBYTE)
	wf=StringWordFeatures(RAWBYTE)

	wf.obtain_from_char(cf, start, order, gap, rev)

	#and output several stats
	#print("max string length", wf.get_max_vector_length())
	#print("number of strings", wf.get_num_vectors())
	#print("length of first string", wf.get_vector_length(0))
	#print("string[2]", wf.get_feature_vector(2))
	#print("strings", wf.get_features())

	#replace string 0
	wf.set_feature_vector(array([1,2,3,4,5], dtype=uint16), 0)

	#print("strings", wf.get_features())
	return wf.get_features(), wf
예제 #26
0
def distribution_histogram (fm_dna=traindna,order=3,gap=0,reverse=False):
	from shogun import StringWordFeatures, StringCharFeatures, DNA
	from shogun import Histogram

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_dna)
	feats=StringWordFeatures(charfeat.get_alphabet())
	feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

	histo=Histogram(feats)
	histo.train()

	histo.get_histogram()

	num_examples=feats.get_num_vectors()
	num_param=histo.get_num_model_parameters()
	#for i in xrange(num_examples):
	#	for j in xrange(num_param):
	#		histo.get_log_derivative(j, i)

	out_likelihood = histo.get_log_likelihood()
	out_sample = histo.get_log_likelihood_sample()
	return histo,out_sample,out_likelihood
예제 #27
0
def distribution_histogram(fm_dna=traindna, order=3, gap=0, reverse=False):
    from shogun import StringWordFeatures, StringCharFeatures, DNA
    from shogun import Histogram

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    feats = StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    histo = Histogram(feats)
    histo.train()

    histo.get_histogram()

    num_examples = feats.get_num_vectors()
    num_param = histo.get_num_model_parameters()
    #for i in xrange(num_examples):
    #	for j in xrange(num_param):
    #		histo.get_log_derivative(j, i)

    out_likelihood = histo.get_log_likelihood()
    out_sample = histo.get_log_likelihood_sample()
    return histo, out_sample, out_likelihood
예제 #28
0
def distance_manhattenword (train_fname=traindna,test_fname=testdna,order=3,gap=0,reverse=False):
	from shogun import StringCharFeatures, StringWordFeatures, DNA
	from shogun import SortWordString, ManhattanWordDistance, CSVFile

	charfeat=StringCharFeatures(CSVFile(train_fname), DNA)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()

	charfeat=StringCharFeatures(CSVFile(test_fname), DNA)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()

	distance=ManhattanWordDistance(feats_train, feats_train)

	dm_train=distance.get_distance_matrix()
	distance.init(feats_train, feats_test)
	dm_test=distance.get_distance_matrix()
	return dm_train,dm_test
def tests_check_commwordkernel_memleak (num, order, gap, reverse):
	import gc
	from shogun import Alphabet,StringCharFeatures,StringWordFeatures,DNA
	from shogun import SortWordString, MSG_DEBUG
	from shogun import CommWordStringKernel, IdentityKernelNormalizer
	from numpy import mat

	POS=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
	num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
	num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
	num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT']
	NEG=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
	num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
	num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
	num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT']

	for i in range(10):
		alpha=Alphabet(DNA)
		traindat=StringCharFeatures(alpha)
		traindat.set_features(POS+NEG)
		trainudat=StringWordFeatures(traindat.get_alphabet());
		trainudat.obtain_from_char(traindat, order-1, order, gap, reverse)
		#trainudat.io.set_loglevel(MSG_DEBUG)
		pre = SortWordString()
		#pre.io.set_loglevel(MSG_DEBUG)
		pre.init(trainudat)
		trainudat.add_preprocessor(pre)
		trainudat.apply_preprocessor()
		spec = CommWordStringKernel(10, False)
		spec.set_normalizer(IdentityKernelNormalizer())
		spec.init(trainudat, trainudat)
		K=spec.get_kernel_matrix()

	del POS
	del NEG
	del order
	del gap
	del reverse
	return K
예제 #30
0
def tests_check_commwordkernel_memleak(num, order, gap, reverse):
    import gc
    from shogun import Alphabet, StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString, MSG_DEBUG
    from shogun import CommWordStringKernel, IdentityKernelNormalizer
    from numpy import mat

    POS = [
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT'
    ]
    NEG = [
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT'
    ]

    for i in range(10):
        alpha = Alphabet(DNA)
        traindat = StringCharFeatures(alpha)
        traindat.set_features(POS + NEG)
        trainudat = StringWordFeatures(traindat.get_alphabet())
        trainudat.obtain_from_char(traindat, order - 1, order, gap, reverse)
        #trainudat.io.set_loglevel(MSG_DEBUG)
        pre = SortWordString()
        #pre.io.set_loglevel(MSG_DEBUG)
        pre.init(trainudat)
        trainudat.add_preprocessor(pre)
        trainudat.apply_preprocessor()
        spec = CommWordStringKernel(10, False)
        spec.set_normalizer(IdentityKernelNormalizer())
        spec.init(trainudat, trainudat)
        K = spec.get_kernel_matrix()

    del POS
    del NEG
    del order
    del gap
    del reverse
    return K