示例#1
0
def create_hashed_features_spectrum(param, data):
    """
    creates hashed dot features for the spectrum kernel
    """

    # extract parameters
    order = param["degree_spectrum"]

    # fixed parameters
    gap = 0
    reverse = True 
    normalize = True

    # create features
    feats_char = StringCharFeatures(data, DNA)
    feats_word = StringWordFeatures(feats_char.get_alphabet())
    feats_word.obtain_from_char(feats_char, order-1, order, gap, reverse)

    # create preproc
    preproc = SortWordString()
    preproc.init(feats_word)
    feats_word.add_preproc(preproc)
    feats_word.apply_preproc()

    # finish 
    feats = ImplicitWeightedSpecFeatures(feats_word, normalize)

    return feats
def manhattan_word_distance ():
	print 'ManhattanWordDistance'

	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
	from shogun.PreProc import SortWordString
	from shogun.Distance import ManhattanWordDistance

	order=3
	gap=0
	reverse=False

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_train_dna)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preproc(preproc)
	feats_train.apply_preproc()

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_test_dna)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preproc(preproc)
	feats_test.apply_preproc()

	distance=ManhattanWordDistance(feats_train, feats_train)

	dm_train=distance.get_distance_matrix()
	distance.init(feats_train, feats_test)
	dm_test=distance.get_distance_matrix()
示例#3
0
    def init_sensor(self, kernel, svs):
        f = StringCharFeatures(svs, DNA)

        kname = kernel['name']
        if kname == 'spectrum':
            wf = StringWordFeatures(f.get_alphabet())
            wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0,
                                False)

            pre = SortWordString()
            pre.init(wf)
            wf.add_preproc(pre)
            wf.apply_preproc()
            f = wf

            k = CommWordStringKernel(0, False)
            k.set_use_dict_diagonal_optimization(kernel['order'] < 8)
            self.preproc = pre

        elif kname == 'wdshift':
            k = WeightedDegreePositionStringKernel(0, kernel['order'])
            k.set_normalizer(IdentityKernelNormalizer())
            k.set_shifts(
                kernel['shift'] *
                numpy.ones(f.get_max_vector_length(), dtype=numpy.int32))
            k.set_position_weights(
                1.0 / f.get_max_vector_length() *
                numpy.ones(f.get_max_vector_length(), dtype=numpy.float64))
        else:
            raise "Currently, only wdshift and spectrum kernels supported"

        self.kernel = k
        self.train_features = f

        return (self.kernel, self.train_features)
示例#4
0
def non_redundant_word_features(feats, kmerlen):
    """convert the features from Shogun toolbox to non-redundant word features (handle reverse complements)
	Arguments:
	feats -- StringWordFeatures
	kmerlen -- integer, length of k-mer

	Return:
	StringWordFeatures after converting reverse complement k-mer ids
	"""

    rcmap = g_rcmap

    for i in xrange(feats.get_num_vectors()):
        nf = [rcmap[int(kmerid)] for kmerid in feats.get_feature_vector(i)]

        feats.set_feature_vector(numpy.array(nf, numpy.dtype('u2')), i)

    preproc = SortWordString()
    preproc.init(feats)
    try:
        feats.add_preproc(preproc)
        feats.apply_preproc()
    except AttributeError:
        feats.add_preprocessor(preproc)
        feats.apply_preprocessor()

    return feats
def sort_word_string ():
	print 'CommWordString'

	from shogun.Kernel import CommWordStringKernel
	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
	from shogun.PreProc import SortWordString

	order=3
	gap=0
	reverse=False

	charfeat=StringCharFeatures(fm_train_dna, DNA)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preproc(preproc)
	feats_train.apply_preproc()

	charfeat=StringCharFeatures(fm_test_dna, DNA)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preproc(preproc)
	feats_test.apply_preproc()

	use_sign=False

	kernel=CommWordStringKernel(feats_train, feats_train, use_sign)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
def preproc_sortwordstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False):

	from shogun.Kernel import CommWordStringKernel
	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
	from shogun.PreProc import SortWordString

	charfeat=StringCharFeatures(fm_train_dna, DNA)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preproc(preproc)
	feats_train.apply_preproc()

	charfeat=StringCharFeatures(fm_test_dna, DNA)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preproc(preproc)
	feats_test.apply_preproc()

	kernel=CommWordStringKernel(feats_train, feats_train, use_sign)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()

	return km_train,km_test,kernel
def distance_canberraword_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False):
	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
	from shogun.PreProc import SortWordString
	from shogun.Distance import CanberraWordDistance
	
	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_train_dna)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preproc(preproc)
	feats_train.apply_preproc()

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_test_dna)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preproc(preproc)
	feats_test.apply_preproc()

	distance=CanberraWordDistance(feats_train, feats_train)

	dm_train=distance.get_distance_matrix()
	distance.init(feats_train, feats_test)
	dm_test=distance.get_distance_matrix()
	return distance,dm_train,dm_test
def non_redundant_word_features(feats, kmerlen):
	"""convert the features from Shogun toolbox to non-redundant word features (handle reverse complements)
	Arguments:
	feats -- StringWordFeatures
	kmerlen -- integer, length of k-mer

	Return:
	StringWordFeatures after converting reverse complement k-mer ids
	"""

	rcmap = g_rcmap

	for i in xrange(feats.get_num_vectors()):
		nf = [rcmap[int(kmerid)] for kmerid in feats.get_feature_vector(i)]

		feats.set_feature_vector(numpy.array(nf, numpy.dtype('u2')), i)

	preproc = SortWordString()
	preproc.init(feats)
	try:
		feats.add_preproc(preproc)
		feats.apply_preproc()
	except AttributeError:
		feats.add_preprocessor(preproc)
		feats.apply_preprocessor()	

	return feats
示例#9
0
    def init_sensor(self, kernel, svs):
        f = StringCharFeatures(svs, DNA)

        kname = kernel['name']
        if  kname == 'spectrum':
            wf = StringWordFeatures(f.get_alphabet())
            wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0, False)

            pre = SortWordString()
            pre.init(wf)
            wf.add_preproc(pre)
            wf.apply_preproc()
            f = wf

            k = CommWordStringKernel(0, False)
            k.set_use_dict_diagonal_optimization(kernel['order'] < 8)
            self.preproc = pre

        elif kname == 'wdshift':
                k = WeightedDegreePositionStringKernel(0, kernel['order'])
                k.set_normalizer(IdentityKernelNormalizer())
                k.set_shifts(kernel['shift'] *
                        numpy.ones(f.get_max_vector_length(), dtype=numpy.int32))
                k.set_position_weights(1.0 / f.get_max_vector_length() *
                        numpy.ones(f.get_max_vector_length(), dtype=numpy.float64))
        else:
            raise "Currently, only wdshift and spectrum kernels supported"

        self.kernel = k
        self.train_features = f

        return (self.kernel, self.train_features)
def create_hashed_features_spectrum(param, data):
    """
    creates hashed dot features for the spectrum kernel
    """

    # extract parameters
    order = param["degree_spectrum"]

    # fixed parameters
    gap = 0
    reverse = True
    normalize = True

    # create features
    feats_char = StringCharFeatures(data, DNA)
    feats_word = StringWordFeatures(feats_char.get_alphabet())
    feats_word.obtain_from_char(feats_char, order - 1, order, gap, reverse)

    # create preproc
    preproc = SortWordString()
    preproc.init(feats_word)
    feats_word.add_preproc(preproc)
    feats_word.apply_preproc()

    # finish
    feats = ImplicitWeightedSpecFeatures(feats_word, normalize)

    return feats
def distance_hammingword_modular(fm_train_dna=traindna,
                                 fm_test_dna=testdna,
                                 fm_test_real=testdat,
                                 order=3,
                                 gap=0,
                                 reverse=False,
                                 use_sign=False):

    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString
    from shogun.Distance import HammingWordDistance

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test = StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    distance = HammingWordDistance(feats_train, feats_train, use_sign)

    dm_train = distance.get_distance_matrix()
    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()
    return distance, dm_train, dm_test
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """
    create feature object used by spectrum kernel
    """

    charfeat = StringCharFeatures(data, DNA)
    feat = StringWordFeatures(charfeat.get_alphabet())
    feat.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feat)
    feat.add_preprocessor(preproc)
    feat.apply_preprocessor()

    return feat
示例#13
0
    def K(self,logtheta,*args):
        """new version using shogun"""
        x1 = args[0][:,self.index].copy()
        if(len(args)==1):
            x2 = x1.copy()
        else:
           x2 = args[1][:,self.index].copy()
        V0 = exp(2*logtheta[0])

        #try to get K from cache
        if (self.x1_.shape[0]==x1.shape[0]) and (self.x2_.shape[0]==x2.shape[0]):
            if ((self.x1_==x1).all() and (self.x2_==x2).all()):
                K = self.K_
                K = K*V0
                return K
            

        DNA = 0
        order = 3
        gap =0
        reverse = False
        charfeat=StringCharFeatures(DNA)
	charfeat.set_string_features(x1.tolist())
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preproc(preproc)
	feats_train.apply_preproc()

	charfeat=StringCharFeatures(DNA)
	charfeat.set_string_features(x2.tolist())
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preproc(preproc)
	feats_test.apply_preproc()
	use_sign=False

        kernel=CommWordStringKernel(feats_train, feats_train, use_sign)
        kernel.init(feats_train,feats_test)
	K=kernel.get_kernel_matrix()
        self.K_ = K

        self.x1_ = x1
        self.x2_ = x2


        #scale factor
        K=K*V0
        return K
示例#14
0
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """
    create feature object used by spectrum kernel
    """

    charfeat = StringCharFeatures(data, DNA)
    feat = StringWordFeatures(charfeat.get_alphabet())
    feat.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc = SortWordString()                                            
    preproc.init(feat)
    feat.add_preprocessor(preproc)
    feat.apply_preprocessor()

    return feat
示例#15
0
def perform_clustering(mss_id):

    import numpy
    import expenv
    
    mss = expenv.MultiSplitSet.get(mss_id)
    


    from method_mhc_mkl import SequencesHandler
    from shogun.Distance import EuclidianDistance, HammingWordDistance
    from shogun.Features import StringCharFeatures, StringWordFeatures, PROTEIN
    from shogun.Clustering import Hierarchical
    from shogun.PreProc import SortWordString
    
    order = 1
    gap = 0
    reverse = False
    
    seq_handler = SequencesHandler()
    
    data = [seq_handler.get_seq(ss.dataset.organism) for ss in mss.split_sets] 

    charfeat=StringCharFeatures(PROTEIN)
    charfeat.set_features(data)
    feats=StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc=SortWordString()
    preproc.init(feats)
    feats.add_preproc(preproc)
    feats.apply_preproc()

    
    use_sign = False

    distance = HammingWordDistance(feats, feats, use_sign)
    #distance = EuclidianDistance()
    
    merges=4
    hierarchical=Hierarchical(merges, distance)
    hierarchical.train()

    hierarchical.get_merge_distances()
    hierarchical.get_cluster_pairs()
    
    
    return hierarchical
示例#16
0
def perform_clustering(mss_id):

    import numpy
    import expenv

    mss = expenv.MultiSplitSet.get(mss_id)

    from method_mhc_mkl import SequencesHandler
    from shogun.Distance import EuclidianDistance, HammingWordDistance
    from shogun.Features import StringCharFeatures, StringWordFeatures, PROTEIN
    from shogun.Clustering import Hierarchical
    from shogun.PreProc import SortWordString

    order = 1
    gap = 0
    reverse = False

    seq_handler = SequencesHandler()

    data = [seq_handler.get_seq(ss.dataset.organism) for ss in mss.split_sets]

    charfeat = StringCharFeatures(PROTEIN)
    charfeat.set_features(data)
    feats = StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feats)
    feats.add_preproc(preproc)
    feats.apply_preproc()

    use_sign = False

    distance = HammingWordDistance(feats, feats, use_sign)
    #distance = EuclidianDistance()

    merges = 4
    hierarchical = Hierarchical(merges, distance)
    hierarchical.train()

    hierarchical.get_merge_distances()
    hierarchical.get_cluster_pairs()

    return hierarchical
示例#17
0
def seqToWordFeatures(train,test,order=7,gap=10):

    charfeat_train, charfeat_test = seqToStringFeatures(train,test)

    reverse=False

    feats_train=StringWordFeatures(charfeat_train.get_alphabet())
    feats_train.obtain_from_char(charfeat_train, order-1, order, gap, reverse)
    preproc=SortWordString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    reverse = False

    feats_test=StringWordFeatures(charfeat_test.get_alphabet())
    feats_test.obtain_from_char(charfeat_test, order-1, order, gap, reverse)
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()
    #pdb.set_trace()
    return feats_train, feats_test
示例#18
0
def non_redundant_word_features(feats, kmerlen):
    """convert the features from Shogun toolbox to non-redundant word features (handle reverse complements)
	Arguments:
	feats -- StringWordFeatures
	kmerlen -- integer, length of k-mer

	Return:
	StringWordFeatures after converting reverse complement k-mer ids
	"""

    rcmap = g_rcmap

    # the number of sequences, 1923 for CT18 window size 5000
    print 'The number of feature vectors: %s' % (feats.get_num_vectors())

    # one vector for each sequence
    for i in xrange(feats.get_num_vectors()):
        # a list of kmer id for each feature vector
        # some k-mer ids may be redundant
        nf = [rcmap[int(kmerid)] for kmerid in feats.get_feature_vector(i)]
        '''
		set_feature_vector (SGVector< ST > vector, int32_t num)
		'''
        feats.set_feature_vector(numpy.array(nf, numpy.dtype('u2')), i)

    # sort features by kmer id
    preproc = SortWordString()
    preproc.init(feats)
    try:
        feats.add_preproc(preproc)
        feats.apply_preproc()
    except AttributeError:
        feats.add_preprocessor(preproc)
        feats.apply_preprocessor()

    return feats
示例#19
0
def normal_word_feature(feats):
    preproc = SortWordString()
    preproc.init(feats)
    feats.add_preprocessor(preproc)
    feats.apply_preprocessor()
    return feats
示例#20
0
def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse):
    import gc
    from shogun.Features import Alphabet, StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString, MSG_DEBUG
    from shogun.Kernel import CommWordStringKernel, IdentityKernelNormalizer
    from numpy import mat

    POS = [
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT'
    ]
    NEG = [
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT',
        num * 'TTGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT',
        num * 'ACGT'
    ]

    for i in xrange(10):
        alpha = Alphabet(DNA)
        traindat = StringCharFeatures(alpha)
        traindat.set_features(POS + NEG)
        trainudat = StringWordFeatures(traindat.get_alphabet())
        trainudat.obtain_from_char(traindat, order - 1, order, gap, reverse)
        #trainudat.io.set_loglevel(MSG_DEBUG)
        pre = SortWordString()
        #pre.io.set_loglevel(MSG_DEBUG)
        pre.init(trainudat)
        trainudat.add_preproc(pre)
        trainudat.apply_preproc()
        spec = CommWordStringKernel(10, False)
        spec.set_normalizer(IdentityKernelNormalizer())
        spec.init(trainudat, trainudat)
        K = spec.get_kernel_matrix()

    del POS
    del NEG
    del order
    del gap
    del reverse
    return K
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT']
order=7
gap=0
reverse=False

for i in xrange(10):
    alpha=Alphabet(DNA)
    traindat=StringCharFeatures(alpha)
    traindat.set_features(POS+NEG)
    trainudat=StringWordFeatures(traindat.get_alphabet());
    trainudat.obtain_from_char(traindat, order-1, order, gap, reverse)
    #trainudat.io.set_loglevel(MSG_DEBUG)
    pre = SortWordString()
    #pre.io.set_loglevel(MSG_DEBUG)
    pre.init(trainudat)
    trainudat.add_preproc(pre)
    trainudat.apply_preproc()
    spec = CommWordStringKernel(10, False)
    spec.set_normalizer(IdentityKernelNormalizer())
    spec.init(trainudat, trainudat)
    K=mat(spec.get_kernel_matrix())

del POS
del NEG
del order
del gap
del reverse
示例#22
0
def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features"""

    if kname == 'gauss' or kname == 'linear' or kname == 'poly':
        examples = numpy.array(examples)
        feats = RealFeatures(examples)
        
    elif kname == 'wd' or kname == 'localalign' or kname == 'localimprove':
        if seq_source == 'dna': 
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con) 
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)

    elif kname == 'spec' or kname == 'cumspec':
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA) 
        elif seq_source == 'protein':    
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)
       
        wf = StringUlongFeatures( feats.get_alphabet() )
        wf.obtain_from_char(feats, kparam['degree']-1, kparam['degree'], 0, kname=='cumspec')
        del feats

        if train_mode:
            preproc = SortUlongString()
            preproc.init(wf)
        wf.add_preproc(preproc)
        ret = wf.apply_preproc()
        #assert(ret)

        feats = wf
    elif kname == 'spec2' or kname == 'cumspec2':
        # spectrum kernel on two sequences
        feats = {}
        feats['combined'] = CombinedFeatures()

        reversed = kname=='cumspec2'

        (ex0,ex1) = zip(*examples)

        f0 = StringCharFeatures(list(ex0), DNA)
        wf = StringWordFeatures(f0.get_alphabet())
        wf.obtain_from_char(f0, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f0

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessors()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f0'] = wf

        f1 = StringCharFeatures(list(ex1), DNA)
        wf = StringWordFeatures( f1.get_alphabet() )
        wf.obtain_from_char(f1, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f1

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preproc(preproc)
        ret = wf.apply_preproc()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f1'] = wf

    else:
        print 'Unknown kernel %s' % kname
    
    return (feats,preproc)
def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse):
	import gc
	from shogun.Features import Alphabet,StringCharFeatures,StringWordFeatures,DNA
	from shogun.PreProc import SortWordString, MSG_DEBUG
	from shogun.Kernel import CommWordStringKernel, IdentityKernelNormalizer
	from numpy import mat

	POS=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', 
	num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', 
	num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', 
	num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT']
	NEG=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', 
	num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', 
	num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', 
	num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
	num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT']

	for i in xrange(10):
		alpha=Alphabet(DNA)
		traindat=StringCharFeatures(alpha)
		traindat.set_features(POS+NEG)
		trainudat=StringWordFeatures(traindat.get_alphabet());
		trainudat.obtain_from_char(traindat, order-1, order, gap, reverse)
		#trainudat.io.set_loglevel(MSG_DEBUG)
		pre = SortWordString()
		#pre.io.set_loglevel(MSG_DEBUG)
		pre.init(trainudat)
		trainudat.add_preproc(pre)
		trainudat.apply_preproc()
		spec = CommWordStringKernel(10, False)
		spec.set_normalizer(IdentityKernelNormalizer())
		spec.init(trainudat, trainudat)
		K=spec.get_kernel_matrix()

	del POS
	del NEG
	del order
	del gap
	del reverse
	return K