def distribution_linearhmm_modular (fm_dna=traindna,order=3,gap=0,reverse=False):

	from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
	from shogun.Distribution import LinearHMM

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_dna)
	feats=StringWordFeatures(charfeat.get_alphabet())
	feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

	hmm=LinearHMM(feats)
	hmm.train()

	hmm.get_transition_probs()

	num_examples=feats.get_num_vectors()
	num_param=hmm.get_num_model_parameters()
	for i in range(num_examples):
		for j in range(num_param):
			hmm.get_log_derivative(j, i)

	out_likelihood = hmm.get_log_likelihood()
	out_sample = hmm.get_log_likelihood_sample()

	return hmm,out_likelihood,out_sample
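
# A hedged usage sketch (not part of the original example): the toy DNA strings
# below are made-up stand-ins for the module-level `traindna` default; any list
# of A/C/G/T strings with length >= order works.
toy_dna = ['ACGTACGTACGTACGT', 'ACGTACGTTTGTACGT', 'CCGTACGTACGAACGT']
toy_hmm, toy_lik, toy_lik_sample = distribution_linearhmm_modular(toy_dna, order=3)
print 'toy log-likelihood', toy_lik
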
def kernel_weighted_comm_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,order=3,gap=0,reverse=True ):
	from shogun.Kernel import WeightedCommWordStringKernel
	from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
	from shogun.Preprocessor import SortWordString

	charfeat=StringCharFeatures(fm_train_dna, DNA)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()

	charfeat=StringCharFeatures(fm_test_dna, DNA)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()

	use_sign=False
	kernel=WeightedCommWordStringKernel(feats_train, feats_train, use_sign)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
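
# Hedged usage sketch for kernel_weighted_comm_word_string_modular; the toy DNA
# lists below are made up (the real defaults, traindat/testdat, live elsewhere).
toy_train_dna = ['ACGTACGTACGT', 'ACGAACGTACGT', 'TTGTACGTACGT']
toy_test_dna = ['ACGTACGTTTGT', 'ACGTACGAACGT']
toy_km_train, toy_km_test, toy_kernel = kernel_weighted_comm_word_string_modular(toy_train_dna, toy_test_dna, order=3)
print 'train kernel matrix shape', toy_km_train.shape
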
def histogram ():
	print 'Histogram'

	from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
	from shogun.Distribution import Histogram

	order=3
	gap=0
	reverse=False

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_dna)
	feats=StringWordFeatures(charfeat.get_alphabet())
	feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

	histo=Histogram(feats)
	histo.train()

	histo.get_histogram()

	num_examples=feats.get_num_vectors()
	num_param=histo.get_num_model_parameters()
	#for i in xrange(num_examples):
	#	for j in xrange(num_param):
	#		histo.get_log_derivative(j, i)

	histo.get_log_likelihood()
	histo.get_log_likelihood_sample()
def kernel_salzberg_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,
order=3,gap=0,reverse=False):
	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
	from shogun.Kernel import SalzbergWordStringKernel
	from shogun.Classifier import PluginEstimate

	charfeat=StringCharFeatures(fm_train_dna, DNA)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

	charfeat=StringCharFeatures(fm_test_dna, DNA)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

	pie=PluginEstimate()
	labels=Labels(label_train_dna)
	pie.set_labels(labels)
	pie.set_features(feats_train)
	pie.train()

	kernel=SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	pie.set_features(feats_test)
	pie.classify().get_labels()
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
	from shogun.Features import StringWordFeatures, StringCharFeatures, CUBE
	from shogun.Distribution import HMM, BW_NORMAL

	charfeat=StringCharFeatures(CUBE)
	charfeat.set_features(fm_cube)
	feats=StringWordFeatures(charfeat.get_alphabet())
	feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

	hmm=HMM(feats, N, M, pseudo)
	hmm.train()
	hmm.baum_welch_viterbi_train(BW_NORMAL)

	num_examples=feats.get_num_vectors()
	num_param=hmm.get_num_model_parameters()
	for i in xrange(num_examples):
		for j in xrange(num_param):
			hmm.get_log_derivative(j, i)

	best_path=0
	best_path_state=0
	for i in xrange(num_examples):
		best_path+=hmm.best_path(i)
		for j in xrange(N):
			best_path_state+=hmm.get_best_path_state(i, j)

	lik_example = hmm.get_log_likelihood()
	lik_sample = hmm.get_log_likelihood_sample()

	return lik_example, lik_sample, hmm
def create_promoter_features(data, param):
    """
    creates promoter combined features
    
    @param examples:
    @param param:
    """

    print "creating promoter features"

    (center, left, right) = split_data_promoter(data, param["center_offset"], param["center_pos"])

    # set up base features
    feat_center = StringCharFeatures(DNA)
    feat_center.set_features(center)
    feat_left = get_spectrum_features(left)
    feat_right = get_spectrum_features(right)

    # construct combined features
    feat = CombinedFeatures()
    feat.append_feature_obj(feat_center)
    feat.append_feature_obj(feat_left)
    feat.append_feature_obj(feat_right)

    return feat
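
# Hedged usage sketch; it assumes split_data_promoter, get_spectrum_features and
# the shogun imports (StringCharFeatures, CombinedFeatures, DNA) are available in
# this module, exactly as the function body above already requires. The sequences
# and offsets are purely illustrative.
promoter_seqs = ["ACGTACGTACGTACGTACGTACGT", "ACGTTTGTACGTACGAACGTACGT"]
promoter_param = {"center_offset": 4, "center_pos": 12}
promoter_feat = create_promoter_features(promoter_seqs, promoter_param)
print "number of combined feature objects", promoter_feat.get_num_feature_obj()
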
def create_hashed_features_spectrum(param, data):
    """
    creates hashed dot features for the spectrum kernel
    """

    # extract parameters
    order = param["degree_spectrum"]

    # fixed parameters
    gap = 0
    reverse = True 
    normalize = True

    # create features
    feats_char = StringCharFeatures(data, DNA)
    feats_word = StringWordFeatures(feats_char.get_alphabet())
    feats_word.obtain_from_char(feats_char, order-1, order, gap, reverse)

    # create preproc
    preproc = SortWordString()
    preproc.init(feats_word)
    feats_word.add_preproc(preproc)
    feats_word.apply_preproc()

    # finish 
    feats = ImplicitWeightedSpecFeatures(feats_word, normalize)

    return feats
def get_kernel_matrix(li):
    """
    Get kernel matrix from a list of strings.
    """

    order = 6
    gap = 2
    reverse = False
    charfeat = StringCharFeatures(RAWBYTE)
    charfeat.set_features(li)
    #Get alphabet.
    feats_train = StringUlongFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    #CommUlongStringKernel needs sorted features.
    preproc = SortUlongString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    use_sign = False

    #Compute kernel matrix between train features.
    kernel = CommUlongStringKernel(feats_train, feats_train, use_sign)
    km_train = kernel.get_kernel_matrix()
    return km_train
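
# Hedged usage sketch for get_kernel_matrix; it assumes the module-level imports
# the function relies on, roughly (module names vary across shogun versions):
#   from shogun.Features import StringCharFeatures, StringUlongFeatures, RAWBYTE
#   from shogun.PreProc import SortUlongString
#   from shogun.Kernel import CommUlongStringKernel
# The strings below are made-up input; they only need length >= order (6).
toy_strings = ["ACGTACGTACGT", "ACGTAAGTACGT", "TTTTACGTACGT"]
toy_km = get_kernel_matrix(toy_strings)
print toy_km.shape  # expected: (3, 3)
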
def linear_hmm ():
	print 'LinearHMM'

	from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
	from shogun.Distribution import LinearHMM

	order=3
	gap=0
	reverse=False

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_dna)
	feats=StringWordFeatures(charfeat.get_alphabet())
	feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

	hmm=LinearHMM(feats)
	hmm.train()

	hmm.get_transition_probs()

	num_examples=feats.get_num_vectors()
	num_param=hmm.get_num_model_parameters()
	for i in xrange(num_examples):
		for j in xrange(num_param):
			hmm.get_log_derivative(j, i)

	hmm.get_log_likelihood()
	hmm.get_log_likelihood_sample()
def sort_word_string ():
	print 'CommWordString'

	from shogun.Kernel import CommWordStringKernel
	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
	from shogun.PreProc import SortWordString

	order=3
	gap=0
	reverse=False

	charfeat=StringCharFeatures(fm_train_dna, DNA)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preproc(preproc)
	feats_train.apply_preproc()

	charfeat=StringCharFeatures(fm_test_dna, DNA)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preproc(preproc)
	feats_test.apply_preproc()

	use_sign=False

	kernel=CommWordStringKernel(feats_train, feats_train, use_sign)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	def get_predictions_from_seqdict(self, seqdic, site):
		""" we need to generate a huge test features object
			containing all locations found in each seqdict-sequence
			and each location (this is necessary to efficiently
			(==fast,low memory) compute the splice outputs
		"""

		seqlen=self.window_right+self.window_left+2

		for s in seqdic:
			position_list=DynamicIntArray()

			sequence=s.seq
			positions=s.preds[site].positions
			for j in xrange(len(positions)):
				i=positions[j] - self.offset -self.window_left
				position_list.append_element(i)

			t=StringCharFeatures([sequence], DNA)
			t.obtain_by_position_list(seqlen, position_list)
			self.wd_kernel.init(self.traindat, t)

			self.wd_kernel.io.enable_progress()
			l=self.svm.apply().get_values()
			self.wd_kernel.cleanup()
			sys.stdout.write("\n...done...\n")

			num=len(s.preds[site].positions)
			scores= num * [0]
			for j in xrange(num):
				scores[j]=l[j]
			s.preds[site].set_scores(scores)
    def init_sensor(self, kernel, svs):
        f = StringCharFeatures(svs, DNA)

        kname = kernel['name']
        if  kname == 'spectrum':
            wf = StringWordFeatures(f.get_alphabet())
            wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0, False)

            pre = SortWordString()
            pre.init(wf)
            wf.add_preprocessor(pre)
            wf.apply_preprocessor()
            f = wf

            k = CommWordStringKernel(0, False)
            k.set_use_dict_diagonal_optimization(kernel['order'] < 8)
            self.preproc = pre

        elif kname == 'wdshift':
                k = WeightedDegreePositionStringKernel(0, kernel['order'])
                k.set_normalizer(IdentityKernelNormalizer())
                k.set_shifts(kernel['shift'] *
                        numpy.ones(f.get_max_vector_length(), dtype=numpy.int32))
                k.set_position_weights(1.0 / f.get_max_vector_length() *
                        numpy.ones(f.get_max_vector_length(), dtype=numpy.float64))
        else:
            raise "Currently, only wdshift and spectrum kernels supported"

        self.kernel = k
        self.train_features = f

        return (self.kernel, self.train_features)
def preproc_sortwordstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False):

	from shogun.Kernel import CommWordStringKernel
	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
	from shogun.PreProc import SortWordString

	charfeat=StringCharFeatures(fm_train_dna, DNA)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preproc(preproc)
	feats_train.apply_preproc()

	charfeat=StringCharFeatures(fm_test_dna, DNA)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preproc(preproc)
	feats_test.apply_preproc()

	kernel=CommWordStringKernel(feats_train, feats_train, use_sign)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()

	return km_train,km_test,kernel
def plugin_estimate_salzberg ():
	print 'PluginEstimate w/ SalzbergWord'

	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
	from shogun.Kernel import SalzbergWordStringKernel
	from shogun.Classifier import PluginEstimate

	order=3
	gap=0
	reverse=False

	charfeat=StringCharFeatures(fm_train_dna, DNA)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

	charfeat=StringCharFeatures(fm_test_dna, DNA)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

	pie=PluginEstimate()
	labels=Labels(label_train_dna)
	pie.set_labels(labels)
	pie.set_features(feats_train)
	pie.train()

	kernel=SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	pie.set_features(feats_test)
	pie.classify().get_labels()
	km_test=kernel.get_kernel_matrix()
def get_wd_features(data, feat_type="dna"):
    """
    create feature object for wdk
    """
    if feat_type == "dna":
        feat = StringCharFeatures(DNA)
    elif feat_type == "protein":
        feat = StringCharFeatures(PROTEIN)
    else:
        raise Exception("unknown feature type")
    feat.set_features(data)

    return feat
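
# Hedged usage sketch; assumes StringCharFeatures, DNA and PROTEIN are imported at
# module level, as get_wd_features itself requires. The toy sequences are made up.
toy_wd_seqs = ["ACGTACGTAC", "ACGTTCGTAC"]
toy_wd_feat = get_wd_features(toy_wd_seqs, feat_type="dna")
print "number of sequences", toy_wd_feat.get_num_vectors()  # expected: 2
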
def kernel_fisher_modular(
    fm_train_dna=traindat,
    fm_test_dna=testdat,
    label_train_dna=label_traindat,
    N=1,
    M=4,
    pseudo=1e-1,
    order=1,
    gap=0,
    reverse=False,
    kargs=[1, False, True],
):

    from shogun.Features import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from shogun.Kernel import PolyKernel
    from shogun.Distribution import HMM, BW_NORMAL  # , MSG_DEBUG

    # train HMM for positive class
    charfeat = StringCharFeatures(fm_hmm_pos, DNA)
    # charfeat.io.set_loglevel(MSG_DEBUG)
    hmm_pos_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_pos_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    pos = HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    charfeat = StringCharFeatures(fm_hmm_neg, DNA)
    hmm_neg_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_neg_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    neg = HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # Kernel training data
    charfeat = StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # Kernel testing data
    charfeat = StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # get kernel on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1)  # estimate prior
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # get kernel on testing data
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a())  # use prior from training data
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def manhattan_word_distance ():
	print 'ManhattanWordDistance'

	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
	from shogun.PreProc import SortWordString
	from shogun.Distance import ManhattanWordDistance

	order=3
	gap=0
	reverse=False

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_train_dna)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preproc(preproc)
	feats_train.apply_preproc()

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_test_dna)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preproc(preproc)
	feats_test.apply_preproc()

	distance=ManhattanWordDistance(feats_train, feats_train)

	dm_train=distance.get_distance_matrix()
	distance.init(feats_train, feats_test)
	dm_test=distance.get_distance_matrix()
def distribution_ppwm_modular (fm_dna=traindna, order=3):
	from shogun.Features import StringByteFeatures, StringCharFeatures, DNA
	from shogun.Distribution import PositionalPWM

	from numpy import array,e,log,exp

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_dna)
	feats=StringByteFeatures(charfeat.get_alphabet())
	feats.obtain_from_char(charfeat, order-1, order, 0, False)

	L=20
	k=3
	sigma = 1
	mu = 4

	ppwm=PositionalPWM()
	ppwm.set_sigma(sigma)
	ppwm.set_mean(mu)
	pwm=array([[0.0, 0.5, 0.1, 1.0],
               [0.0, 0.5, 0.5, 0.0],
               [1.0, 0.0, 0.4, 0.0],
               [0.0, 0.0, 0.0, 0.0]])
	# note: the 4x3 PWM below overrides the 4x4 matrix defined above
	pwm=array([[0.01,0.09,0.1],[0.09,0.01,0.1],[0.85,0.4,0.1],[0.05,0.5,0.7]])

	ppwm.set_pwm(log(pwm))
	#print(ppwm.get_pwm())
	ppwm.compute_w(L)
	w=ppwm.get_w()
	#print(w)
	#from pylab import *
	#figure(1)
	#pcolor(exp(w))
	#pcolor(w)
	#colorbar()

	#figure(2)
	ppwm.compute_scoring(1)
	u=ppwm.get_scoring(0)
	#pcolor(exp(u))
	#show()

	#ppwm=PositionalPWM(feats)
	#ppwm.train()

	#out_likelihood = histo.get_log_likelihood()
	#out_sample = histo.get_log_likelihood_sample()
	return w,u
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """
    create feature object used by spectrum kernel
    """

    charfeat = StringCharFeatures(data, DNA)
    feat = StringWordFeatures(charfeat.get_alphabet())
    feat.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feat)
    feat.add_preprocessor(preproc)
    feat.apply_preprocessor()

    return feat
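
# Hedged usage sketch for get_spectrum_features; assumes the module-level shogun
# imports (StringCharFeatures, StringWordFeatures, DNA, SortWordString) and a
# shogun version providing add_preprocessor/apply_preprocessor, as used above.
toy_spec_seqs = ["ACGTACGTACGT", "ACGTACGAACGT"]
toy_spec_feat = get_spectrum_features(toy_spec_seqs, order=3, gap=0, reverse=True)
print "number of word feature vectors", toy_spec_feat.get_num_vectors()
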
def kernel_comm_ulong_string_modular (fm_train_dna=traindat,fm_test_dna=testdat, order=3, gap=0, reverse = False):

	from shogun.Kernel import CommUlongStringKernel
	from shogun.Features import StringUlongFeatures, StringCharFeatures, DNA
	from shogun.Preprocessor import SortUlongString

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_train_dna)
	feats_train=StringUlongFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortUlongString()
	preproc.init(feats_train)
	feats_train.add_preproc(preproc)
	feats_train.apply_preproc()


	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_test_dna)
	feats_test=StringUlongFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preproc(preproc)
	feats_test.apply_preproc()

	use_sign=False

	kernel=CommUlongStringKernel(feats_train, feats_train, use_sign)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
def kernel_histogram_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,order=3,gap=0,reverse=False):

	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
	from shogun.Kernel import HistogramWordStringKernel
	from shogun.Classifier import PluginEstimate#, MSG_DEBUG

	charfeat=StringCharFeatures(DNA)
	#charfeat.io.set_loglevel(MSG_DEBUG)
	charfeat.set_features(fm_train_dna)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_test_dna)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

	pie=PluginEstimate()
	labels=Labels(label_train_dna)
	pie.set_labels(labels)
	pie.set_features(feats_train)
	pie.train()

	kernel=HistogramWordStringKernel(feats_train, feats_train, pie)
	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	pie.set_features(feats_test)
	pie.apply().get_labels()
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
def distance_hammingword_modular (fm_train_dna=traindna,fm_test_dna=testdna,
		fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False):

	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
	from shogun.Preprocessor import SortWordString
	from shogun.Distance import HammingWordDistance

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_train_dna)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_test_dna)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()

	distance=HammingWordDistance(feats_train, feats_train, use_sign)

	dm_train=distance.get_distance_matrix()
	distance.init(feats_train, feats_test)
	dm_test=distance.get_distance_matrix()
	return distance,dm_train,dm_test
def perform_clustering(mss_id):

    import numpy
    import expenv
    
    mss = expenv.MultiSplitSet.get(mss_id)
    


    from method_mhc_mkl import SequencesHandler
    from shogun.Distance import EuclidianDistance, HammingWordDistance
    from shogun.Features import StringCharFeatures, StringWordFeatures, PROTEIN
    from shogun.Clustering import Hierarchical
    from shogun.PreProc import SortWordString
    
    order = 1
    gap = 0
    reverse = False
    
    seq_handler = SequencesHandler()
    
    data = [seq_handler.get_seq(ss.dataset.organism) for ss in mss.split_sets] 

    charfeat=StringCharFeatures(PROTEIN)
    charfeat.set_features(data)
    feats=StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc=SortWordString()
    preproc.init(feats)
    feats.add_preproc(preproc)
    feats.apply_preproc()

    
    use_sign = False

    distance = HammingWordDistance(feats, feats, use_sign)
    #distance = EuclidianDistance()
    
    merges=4
    hierarchical=Hierarchical(merges, distance)
    hierarchical.train()

    hierarchical.get_merge_distances()
    hierarchical.get_cluster_pairs()
    
    
    return hierarchical
def kernel_top_modular(
    fm_train_dna=traindat,
    fm_test_dna=testdat,
    label_train_dna=label_traindat,
    pseudo=1e-1,
    order=1,
    gap=0,
    reverse=False,
    kargs=[1, False, True],
):
    from shogun.Features import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
    from shogun.Kernel import PolyKernel
    from shogun.Distribution import HMM, BW_NORMAL

    N = 1  # toy HMM with 1 state
    M = 4  # 4 observations -> DNA

    # train HMM for positive class
    charfeat = StringCharFeatures(fm_hmm_pos, DNA)
    hmm_pos_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_pos_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    pos = HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    charfeat = StringCharFeatures(fm_hmm_neg, DNA)
    hmm_neg_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_neg_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    neg = HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # Kernel training data
    charfeat = StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # Kernel testing data
    charfeat = StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # get kernel on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = TOPFeatures(10, pos, neg, False, False)
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # get kernel on testing data
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False)
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def fisher ():
	print "Fisher Kernel"
	from shogun.Features import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
	from shogun.Kernel import PolyKernel
	from shogun.Distribution import HMM, BW_NORMAL

	N=1 # toy HMM with 1 state 
	M=4 # 4 observations -> DNA
	pseudo=1e-1
	order=1
	gap=0
	reverse=False
	kargs=[1, False, True]

	# train HMM for positive class
	charfeat=StringCharFeatures(fm_hmm_pos, DNA)
	hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
	hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	pos=HMM(hmm_pos_train, N, M, pseudo)
	pos.baum_welch_viterbi_train(BW_NORMAL)

	# train HMM for negative class
	charfeat=StringCharFeatures(fm_hmm_neg, DNA)
	hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
	hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	neg=HMM(hmm_neg_train, N, M, pseudo)
	neg.baum_welch_viterbi_train(BW_NORMAL)

	# Kernel training data
	charfeat=StringCharFeatures(fm_train_dna, DNA)
	wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
	wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

	# Kernel testing data
	charfeat=StringCharFeatures(fm_test_dna, DNA)
	wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
	wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

	# get kernel on training data
	pos.set_observations(wordfeats_train)
	neg.set_observations(wordfeats_train)
	feats_train=FKFeatures(10, pos, neg)
	feats_train.set_opt_a(-1) #estimate prior
	kernel=PolyKernel(feats_train, feats_train, *kargs)
	km_train=kernel.get_kernel_matrix()

	# get kernel on testing data
	pos_clone=HMM(pos)
	neg_clone=HMM(neg)
	pos_clone.set_observations(wordfeats_test)
	neg_clone.set_observations(wordfeats_test)
	feats_test=FKFeatures(10, pos_clone, neg_clone)
	feats_test.set_a(feats_train.get_a()) #use prior from training data
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
def features_string_char_modular(strings):
    from shogun.Features import StringCharFeatures, RAWBYTE
    from numpy import array

    # create string features
    f = StringCharFeatures(strings, RAWBYTE)

    # and output several stats
    # print "max string length", f.get_max_vector_length()
    # print "number of strings", f.get_num_vectors()
    # print "length of first string", f.get_vector_length(0)
    # print "string[5]", ''.join(f.get_feature_vector(5))
    # print "strings", f.get_features()

    # replace string 0
    f.set_feature_vector(array(["t", "e", "s", "t"]), 0)

    # print "strings", f.get_features()
    return f.get_features(), f
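
# Hedged usage sketch: any list of raw byte strings works; these are made up.
# The function returns (list of feature vectors, StringCharFeatures object); at
# least one string is required, since feature vector 0 gets replaced above.
toy_raw_strings = ['hey', 'guys', 'i', 'am', 'a', 'string']
toy_raw_feats, toy_raw_obj = features_string_char_modular(toy_raw_strings)
print "number of strings", toy_raw_obj.get_num_vectors()
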
def distribution_ppwm_modular(fm_dna=traindna, order=3):
    from shogun.Features import StringByteFeatures, StringCharFeatures, DNA
    from shogun.Distribution import PositionalPWM

    from numpy import array, e, log, exp

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    feats = StringByteFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, 0, False)

    ppwm = PositionalPWM()
    ppwm.set_sigma(5.0)
    ppwm.set_mean(10.0)
    pwm = array([[0.0, 0.5, 0.1, 1.0], [0.0, 0.5, 0.5, 0.0], [1.0, 0.0, 0.4, 0.0], [0.0, 0.0, 0.0, 0.0]])
    ppwm.set_pwm(log(pwm))
    print ppwm.get_pwm()
    ppwm.compute_w(20)
    w = ppwm.get_w()
	def get_predictions(self, sequence, positions):

		seqlen=self.window_right+self.window_left+2
		num=len(positions)

		testdat = []

		for j in xrange(num):
			i=positions[j] - self.offset ;
			s=sequence[i-self.window_left:i+self.window_right+2]
			testdat.append(s)

		t=StringCharFeatures(DNA)
		t.set_string_features(testdat)

		self.wd_kernel.init(self.traindat, t)
		l=self.svm.classify().get_labels()
		sys.stderr.write("\n...done...\n")
		return l
	def get_predictions(self, sequence, positions):

		seqlen=self.window_right+self.window_left+2
		num=len(positions)

		position_list=DynamicIntArray()

		for j in xrange(num):
			i=positions[j] - self.offset - self.window_left
			position_list.append_element(i)

		t=StringCharFeatures([sequence], DNA)
		t.obtain_by_position_list(seqlen, position_list)
		self.wd_kernel.init(self.traindat, t)
		del t

		self.wd_kernel.io.enable_progress()
		l=self.svm.apply().get_values()
		self.wd_kernel.cleanup()
		sys.stdout.write("\n...done...\n")
		return l
def classifier_svmlight_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,C=1.2,epsilon=1e-5,num_threads=1):
	from shogun.Features import StringCharFeatures, Labels, DNA
	from shogun.Kernel import WeightedDegreeStringKernel
	try:
		from shogun.Classifier import SVMLight
	except ImportError:
		print 'No support for SVMLight available.'
		return

	feats_train=StringCharFeatures(DNA)
	feats_train.set_features(fm_train_dna)
	feats_test=StringCharFeatures(DNA)
	feats_test.set_features(fm_test_dna)
	degree=20

	kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

	labels=Labels(label_train_dna)

	svm=SVMLight(C, kernel, labels)
	svm.set_epsilon(epsilon)
	svm.parallel.set_num_threads(num_threads)
	svm.train()

	kernel.init(feats_train, feats_test)
	svm.apply().get_labels()
	return kernel
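
# Hedged usage sketch; the toy data replace the module-level traindat/testdat/
# label_traindat defaults. All strings must share one length of at least
# `degree` (20) characters for the WeightedDegreeStringKernel used above.
from numpy import array
toy_train_seqs = ['ACGTACGTACGTACGTACGT', 'ACGAACGTACGTACGTACGT',
		'TTGTACGTACGTACGTACGT', 'ACGTACGTACGTACGTTTTT']
toy_test_seqs = ['ACGTACGTACGTACGTTTGT', 'ACGTACGTACGAACGTACGT']
toy_labels = array([1.0, -1.0, 1.0, -1.0])
toy_wd_kernel = classifier_svmlight_modular(toy_train_seqs, toy_test_seqs, toy_labels)
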
def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features"""

    if kname == 'gauss' or kname == 'linear' or kname == 'poly':
        examples = numpy.array(examples)
        feats = RealFeatures(examples)
        
    elif kname == 'wd' or kname == 'localalign' or kname == 'localimprove':
        if seq_source == 'dna': 
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con) 
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)

    elif kname == 'spec' or kname == 'cumspec':
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA) 
        elif seq_source == 'protein':    
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)
       
        wf = StringUlongFeatures( feats.get_alphabet() )
        wf.obtain_from_char(feats, kparam['degree']-1, kparam['degree'], 0, kname=='cumspec')
        del feats

        if train_mode:
            preproc = SortUlongString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        #assert(ret)

        feats = wf
    elif kname == 'spec2' or kname == 'cumspec2':
        # spectrum kernel on two sequences
        feats = {}
        feats['combined'] = CombinedFeatures()

        reversed = kname=='cumspec2'

        (ex0,ex1) = zip(*examples)

        f0 = StringCharFeatures(list(ex0), DNA)
        wf = StringWordFeatures(f0.get_alphabet())
        wf.obtain_from_char(f0, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f0

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f0'] = wf

        f1 = StringCharFeatures(list(ex1), DNA)
        wf = StringWordFeatures( f1.get_alphabet() )
        wf.obtain_from_char(f1, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f1

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f1'] = wf

    else:
        print 'Unknown kernel %s' % kname
        sys.exit(-1)
    
    return (feats,preproc)
def kernel_fisher_modular(fm_train_dna=traindat,
                          fm_test_dna=testdat,
                          label_train_dna=label_traindat,
                          N=1,
                          M=4,
                          pseudo=1e-1,
                          order=1,
                          gap=0,
                          reverse=False,
                          kargs=[1, False, True]):

    from shogun.Features import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from shogun.Kernel import PolyKernel
    from shogun.Distribution import HMM, BW_NORMAL  #, MSG_DEBUG

    # train HMM for positive class
    charfeat = StringCharFeatures(fm_hmm_pos, DNA)
    #charfeat.io.set_loglevel(MSG_DEBUG)
    hmm_pos_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_pos_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    pos = HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    charfeat = StringCharFeatures(fm_hmm_neg, DNA)
    hmm_neg_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_neg_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    neg = HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # Kernel training data
    charfeat = StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # Kernel testing data
    charfeat = StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # get kernel on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1)  #estimate prior
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # get kernel on testing data
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a())  #use prior from training data
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def features_string_char_compressed_modular(fname):
	from shogun.Features import StringCharFeatures, StringFileCharFeatures, RAWBYTE
	from shogun.Library import UNCOMPRESSED,LZO,GZIP,BZIP2,LZMA, MSG_DEBUG
	from shogun.PreProc import DecompressCharString

	f=StringFileCharFeatures(fname, RAWBYTE)

	#print "original strings", f.get_features()

	#uncompressed
	f.save_compressed("foo_uncompressed.str", UNCOMPRESSED, 1)
	f2=StringCharFeatures(RAWBYTE);
	f2.load_compressed("foo_uncompressed.str", True)
	#print "uncompressed strings", f2.get_features()
	#print

	# load compressed data and uncompress on load

	#lzo
	f.save_compressed("foo_lzo.str", LZO, 9)
	f2=StringCharFeatures(RAWBYTE);
	f2.load_compressed("foo_lzo.str", True)
	#print "lzo strings", f2.get_features()
	#print

	##gzip
	f.save_compressed("foo_gzip.str", GZIP, 9)
	f2=StringCharFeatures(RAWBYTE);
	f2.load_compressed("foo_gzip.str", True)
	#print "gzip strings", f2.get_features()
	#print

	#bzip2
	f.save_compressed("foo_bzip2.str", BZIP2, 9)
	f2=StringCharFeatures(RAWBYTE);
	f2.load_compressed("foo_bzip2.str", True)
	#print "bzip2 strings", f2.get_features()
	#print

	#lzma
	f.save_compressed("foo_lzma.str", LZMA, 9)
	f2=StringCharFeatures(RAWBYTE);
	f2.load_compressed("foo_lzma.str", True)
	#print "lzma strings", f2.get_features()
	#print

	# load compressed data and uncompress via preprocessor
	f2=StringCharFeatures(RAWBYTE);
	f2.load_compressed("foo_lzo.str", False)
	f2.add_preproc(DecompressCharString(LZO))
	f2.apply_preproc()
	#print "lzo strings", f2.get_features()
	#print

	# load compressed data and uncompress on-the-fly via preprocessor
	f2=StringCharFeatures(RAWBYTE);
	f2.load_compressed("foo_lzo.str", False)
	#f2.io.set_loglevel(MSG_DEBUG)
	f2.add_preproc(DecompressCharString(LZO))
	f2.enable_on_the_fly_preprocessing()
	#print "lzo strings", f2.get_features()
	#print

	#clean up
	import os
	for f in ['foo_uncompressed.str', 'foo_lzo.str', 'foo_gzip.str',
			'foo_bzip2.str', 'foo_lzma.str']:
		if os.path.exists(f):
			os.unlink(f)
d = dat["thaliana"]
subset_size = 20

examples = [i.example for i in d[0:subset_size]]
labels = [i.label for i in d[0:subset_size]]

print "len(examples)", len(examples)
print "string length", len(examples[0])

labels[2] = 1
labels[12] = 1
labels[15] = 1
labels[8] = 1
labels[19] = 1

feat = StringCharFeatures(DNA)
feat.set_features(examples)

helper.save("/tmp/feat", feat)
feat2 = helper.load("/tmp/feat")

wdk = WeightedDegreeStringKernel(feat, feat, 1)

print "PY: saving kernel"
wdk.io.set_loglevel(MSG_DEBUG)
helper.save("/tmp/awesome", wdk)
#print wdk.toString()
#print "PY: kernel saved, loading kernel"
wdk2 = helper.load("/tmp/awesome")
print "PY: kernel loaded"
    def solve(self, C, all_xt, all_lt, task_indicator, M, L):
        """
        implementation using multitask kernel
        """

        xt = numpy.array(all_xt)
        lt = numpy.array(all_lt)
        tt = numpy.array(task_indicator, dtype=numpy.int32)
        tsm = numpy.array(M)

        print "task_sim:", tsm

        num_tasks = L.shape[0]

        # sanity checks
        assert len(xt) == len(lt) == len(tt)
        assert M.shape == L.shape
        assert num_tasks == len(set(tt))

        # set up shogun objects
        if type(xt[0]) == numpy.string_:
            feat = StringCharFeatures(DNA)
            xt = [str(a) for a in xt]
            feat.set_features(xt)
            base_kernel = WeightedDegreeStringKernel(feat, feat, 8)
        else:
            feat = RealFeatures(xt.T)
            base_kernel = LinearKernel(feat, feat)

        lab = BinaryLabels(lt)

        # set up normalizer
        normalizer = MultitaskKernelNormalizer(tt.tolist())

        for i in xrange(num_tasks):
            for j in xrange(num_tasks):
                normalizer.set_task_similarity(i, j, M[i, j])

        print "num of unique tasks: ", normalizer.get_num_unique_tasks(
            task_indicator)

        # set up kernel
        base_kernel.set_cache_size(4000)
        base_kernel.set_normalizer(normalizer)
        base_kernel.init_normalizer()

        # set up svm
        svm = SVMLight()  #LibSVM()

        svm.set_epsilon(self.eps)

        #SET THREADS TO 1
        #print "reducing num threads to one"
        #segfaults
        #svm.parallel.set_num_threads(1)
        #print "using one thread"

        # how often do we like to compute objective etc
        svm.set_record_interval(self.record_interval)
        svm.set_min_interval(self.min_interval)
        #svm.set_target_objective(target_obj)

        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        #svm.set_shrinking_enabled(False)
        svm.io.set_loglevel(MSG_DEBUG)

        svm.set_C(C, C)
        svm.set_bias_enabled(False)

        # prepare for training
        svm.set_labels(lab)
        svm.set_kernel(base_kernel)

        # train svm
        svm.train()

        if self.record_variables:

            print "recording variables"

            self.dual_objectives = [-obj for obj in svm.get_dual_objectives()]
            self.train_times = svm.get_training_times()

            # get model parameters
            sv_idx = svm.get_support_vectors()
            sparse_alphas = svm.get_alphas()

            assert len(sv_idx) == len(sparse_alphas)

            # compute dense alpha (remove label)
            self.alphas = numpy.zeros(len(xt))
            for id_sparse, id_dense in enumerate(sv_idx):
                self.alphas[id_dense] = sparse_alphas[id_sparse] * lt[id_dense]

            # print alphas
            W = alphas_to_w(self.alphas, xt, lt, task_indicator, M)
            self.W = W

            #
            self.final_primal_obj = compute_primal_objective(
                W.reshape(W.shape[0] * W.shape[1]), C, all_xt, all_lt,
                task_indicator, L)

            print "MTK duality gap:", self.dual_objectives[
                -1] - self.final_primal_obj

        return True
def kernel_top_modular(fm_train_dna=traindat,
                       fm_test_dna=testdat,
                       label_train_dna=label_traindat,
                       pseudo=1e-1,
                       order=1,
                       gap=0,
                       reverse=False,
                       kargs=[1, False, True]):
    from shogun.Features import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
    from shogun.Kernel import PolyKernel
    from shogun.Distribution import HMM, BW_NORMAL

    N = 1  # toy HMM with 1 state
    M = 4  # 4 observations -> DNA

    # train HMM for positive class
    charfeat = StringCharFeatures(fm_hmm_pos, DNA)
    hmm_pos_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_pos_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    pos = HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    charfeat = StringCharFeatures(fm_hmm_neg, DNA)
    hmm_neg_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_neg_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    neg = HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # Kernel training data
    charfeat = StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # Kernel testing data
    charfeat = StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # get kernel on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = TOPFeatures(10, pos, neg, False, False)
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # get kernel on testing data
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False)
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def features_string_sliding_window_modular(strings):
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Library import DynamicIntArray

    f = StringCharFeatures([strings], DNA)

    # slide a window of length 5 over features
    # (memory efficient, does not copy strings)
    f.obtain_by_sliding_window(5, 1)
    #print(f.get_num_vectors())
    #print(f.get_vector_length(0))
    #print(f.get_vector_length(1))
    #print(f.get_features())

    # slide a window of length 4 over features
    # (memory efficient, does not copy strings)
    f.obtain_by_sliding_window(4, 1)
    #print(f.get_num_vectors())
    #print(f.get_vector_length(0))
    #print(f.get_vector_length(1))
    #print(f.get_features())

    # extract string-windows at position 0,6,16,25 of window size 4
    # (memory efficient, does not copy strings)
    f.set_features([strings])
    positions = DynamicIntArray()
    positions.append_element(0)
    positions.append_element(6)
    positions.append_element(16)
    positions.append_element(25)

    f.obtain_by_position_list(4, positions)
    #print(f.get_features())

    # now extract windows of size 8 from same positon list
    f.obtain_by_position_list(8, positions)
    #print(f.get_features())
    return f
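
# Hedged usage sketch: the argument is a single DNA string of at least ~33
# characters, so the position list (0, 6, 16, 25) with window size 8 stays in
# range. The sequence below is made up.
toy_window_seq = 'AAAAATTTTTCCCCCGGGGGAAAAATTTTTCCCCC'
toy_sliding_feat = features_string_sliding_window_modular(toy_window_seq)
print 'number of windows', toy_sliding_feat.get_num_vectors()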