예제 #1
0
def distribution_linearhmm_modular(fm_dna=traindna,
                                   order=3,
                                   gap=0,
                                   reverse=False):
    """Train a LinearHMM on word features built from DNA strings.

    ``fm_dna`` is loaded into DNA character features, which are converted to
    word features with ``obtain_from_char(chars, order - 1, order, gap,
    reverse)``. After training, the transition probabilities and every
    (parameter, example) log-derivative are queried; those results are
    discarded.

    Returns:
        ``(hmm, out_likelihood, out_sample)``: the trained model, the result
        of ``get_log_likelihood()`` and the result of
        ``get_log_likelihood_sample()``.
    """
    from modshogun import StringWordFeatures, StringCharFeatures, DNA
    from modshogun import LinearHMM

    dna_chars = StringCharFeatures(DNA)
    dna_chars.set_features(fm_dna)
    word_feats = StringWordFeatures(dna_chars.get_alphabet())
    word_feats.obtain_from_char(dna_chars, order - 1, order, gap, reverse)

    model = LinearHMM(word_feats)
    model.train()

    # Called only to exercise the accessor; the value is not used.
    model.get_transition_probs()

    example_count = word_feats.get_num_vectors()
    param_count = model.get_num_model_parameters()
    for example_idx in range(example_count):
        for param_idx in range(param_count):
            model.get_log_derivative(param_idx, example_idx)

    return (model,
            model.get_log_likelihood(),
            model.get_log_likelihood_sample())
def runShogunSVMDNASubsequenceStringKernel(train_xt, train_lt, test_xt):
    """Train a LibSVM with a subsequence string kernel on DNA strings.

    Relies on the module-level names ``MAXLEN``, ``DECAY``, ``SVMC`` and
    ``MSG_DEBUG`` as well as the Shogun classes used below.

    Args:
        train_xt: training strings, wrapped in ``StringCharFeatures(.., DNA)``.
        train_lt: training labels, wrapped in ``BinaryLabels``.
        test_xt: test strings, wrapped in ``StringCharFeatures(.., DNA)``.

    Returns:
        ``(out1, out2, out1DecisionValues, out2DecisionValues)`` where the
        ``*DecisionValues`` entries are the objects returned by
        ``svm.apply`` on the training / test features and ``out1`` / ``out2``
        are their ``get_labels()`` results.
    """

    ##################################################
    # set up svm
    feats_train = StringCharFeatures(train_xt, DNA)
    feats_test = StringCharFeatures(test_xt, DNA)

    kernel = SubsequenceStringKernel(feats_train, feats_train, MAXLEN, DECAY)
    kernel.io.set_loglevel(MSG_DEBUG)
    kernel.init(feats_train, feats_train)

    # labels for training
    labels = BinaryLabels(train_lt)

    # run svm model
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Ready to train!")
    svm = LibSVM(SVMC, kernel, labels)
    svm.io.set_loglevel(MSG_DEBUG)
    svm.train()

    # predictions
    print("Making predictions!")
    out1DecisionValues = svm.apply(feats_train)
    out1 = out1DecisionValues.get_labels()
    # Re-initialise the kernel on (train, test) before predicting on test data.
    kernel.init(feats_train, feats_test)
    out2DecisionValues = svm.apply(feats_test)
    out2 = out2DecisionValues.get_labels()

    return out1, out2, out1DecisionValues, out2DecisionValues
def runShogunSVMDNALinearStringKernel(train_xt, train_lt, test_xt):
    """Train a LibSVM with a linear string kernel on DNA strings.

    Relies on the module-level names ``SVMC`` and ``MSG_DEBUG`` as well as
    the Shogun classes used below.

    Args:
        train_xt: training strings, wrapped in ``StringCharFeatures(.., DNA)``.
        train_lt: training labels, wrapped in ``BinaryLabels``.
        test_xt: test strings, wrapped in ``StringCharFeatures(.., DNA)``.

    Returns:
        ``(out1, out2)``: the ``get_labels()`` results of ``svm.apply`` on the
        training and on the test features.
    """

    ##################################################
    # set up svm
    feats_train = StringCharFeatures(train_xt, DNA)
    feats_test = StringCharFeatures(test_xt, DNA)

    kernel = LinearStringKernel(feats_train, feats_train)
    kernel.io.set_loglevel(MSG_DEBUG)

    # labels for training
    labels = BinaryLabels(train_lt)

    # run svm model
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Ready to train!")
    svm = LibSVM(SVMC, kernel, labels)
    svm.io.set_loglevel(MSG_DEBUG)
    svm.train()

    # predictions
    print("Making predictions!")
    out1 = svm.apply(feats_train).get_labels()
    # Re-initialise the kernel on (train, test) before predicting on test data.
    kernel.init(feats_train, feats_test)
    out2 = svm.apply(feats_test).get_labels()

    return out1, out2
def kernel_histogram_word_string_modular(fm_train_dna=traindat,
                                         fm_test_dna=testdat,
                                         label_train_dna=label_traindat,
                                         order=3,
                                         ppseudo_count=1,
                                         npseudo_count=1):
    """Compute HistogramWordStringKernel matrices for DNA word features.

    A ``PluginEstimate`` is trained on the training word features and
    labels and handed to the kernel. The estimate is also applied to the
    test features, but that prediction is discarded.

    Returns:
        ``(km_train, km_test, kernel)``: the kernel matrix on (train, train),
        the kernel matrix on (train, test) and the kernel object.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from modshogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
    from modshogun import PluginEstimate

    def _dna_words(sequences):
        # Character features -> word features, with gap 0 and no reversal.
        chars = StringCharFeatures(DNA)
        chars.set_features(sequences)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, 0, False)
        return words

    words_train = _dna_words(fm_train_dna)
    words_test = _dna_words(fm_test_dna)

    estimator = PluginEstimate(ppseudo_count, npseudo_count)
    train_labels = BinaryLabels(label_train_dna)
    estimator.set_labels(train_labels)
    estimator.set_features(words_train)
    estimator.train()

    kernel = HistogramWordStringKernel(words_train, words_train, estimator)
    km_train = kernel.get_kernel_matrix()

    kernel.init(words_train, words_test)
    estimator.set_features(words_test)
    # Prediction on the test features; the result is intentionally unused.
    estimator.apply().get_labels()
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def kernel_weighted_comm_word_string_modular(fm_train_dna=traindat,
                                             fm_test_dna=testdat,
                                             order=3,
                                             gap=0,
                                             reverse=True):
    """Compute WeightedCommWordStringKernel matrices for DNA word features.

    Both feature sets are passed through the same ``SortWordString``
    preprocessor, which is initialised on the training features only.

    Returns:
        ``(km_train, km_test, kernel)``: the kernel matrix on (train, train),
        the kernel matrix on (train, test) and the kernel object.
    """
    from modshogun import WeightedCommWordStringKernel
    from modshogun import StringWordFeatures, StringCharFeatures, DNA
    from modshogun import SortWordString

    def _dna_words(sequences):
        # Character features -> word features of the requested order.
        chars = StringCharFeatures(sequences, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    words_train = _dna_words(fm_train_dna)
    sorter = SortWordString()
    sorter.init(words_train)
    words_train.add_preprocessor(sorter)
    words_train.apply_preprocessor()

    words_test = _dna_words(fm_test_dna)
    words_test.add_preprocessor(sorter)
    words_test.apply_preprocessor()

    # The third constructor argument is the kernel's use_sign flag.
    kernel = WeightedCommWordStringKernel(words_train, words_train, False)
    km_train = kernel.get_kernel_matrix()

    kernel.init(words_train, words_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
예제 #6
0
def kernel_poly_match_word_string_modular(fm_train_dna=traindat,
                                          fm_test_dna=testdat,
                                          degree=2,
                                          inhomogene=True,
                                          order=3,
                                          gap=0,
                                          reverse=False):
    """Compute PolyMatchWordStringKernel matrices for DNA word features.

    Returns:
        ``(km_train, km_test, kernel)``: the kernel matrix on (train, train),
        the kernel matrix on (train, test) and the kernel object.
    """
    from modshogun import PolyMatchWordStringKernel
    from modshogun import StringWordFeatures, StringCharFeatures, DNA

    def _dna_words(sequences):
        # Character features -> word features of the requested order.
        chars = StringCharFeatures(sequences, DNA)
        words = StringWordFeatures(DNA)
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    words_train = _dna_words(fm_train_dna)
    words_test = _dna_words(fm_test_dna)

    kernel = PolyMatchWordStringKernel(
        words_train, words_train, degree, inhomogene)
    km_train = kernel.get_kernel_matrix()

    kernel.init(words_train, words_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
예제 #7
0
파일: wdsvm.py 프로젝트: zhouyu/polyAcode
def cross_validation(X, Y, d, c, K):
    """Return the mean accuracy of ``svm_process`` over K contiguous folds.

    The data is cut into ``K`` consecutive folds of ``len(Y) // K`` samples.
    Fold ``k`` covers indices ``[k * n, (k + 1) * n)`` and is used as the
    test set while all remaining samples form the training set. If
    ``len(Y)`` is not a multiple of ``K``, the trailing remainder is never
    tested and always stays in the training set.

    Args:
        X: sequence of DNA strings (sliceable).
        Y: sequence of labels aligned with ``X`` (sliceable).
        d: degree forwarded to ``svm_process``.
        c: C value forwarded to ``svm_process``.
        K: number of folds.

    Returns:
        Mean of the per-fold accuracies returned by ``svm_process``.
    """
    N = len(Y)
    # Integer division: the fold size is used as a slice bound below.
    n = N // K

    accuracy_list = []

    for k in range(K):
        print('degree = %s\tC = %s\tcross_validation_iter = %s/%s' %
              (d, c, k + 1, K))
        sys.stdout.flush()

        # Fold k starts at k * n (not at k), so the folds do not overlap.
        start = k * n
        stop = start + n

        X_test = list(X[start:stop])
        Y_test = list(Y[start:stop])
        X_train = list(X[:start]) + list(X[stop:])
        Y_train = list(Y[:start]) + list(Y[stop:])

        X_train = StringCharFeatures(X_train, DNA)
        X_test = StringCharFeatures(X_test, DNA)
        Y_train = BinaryLabels(np.array(Y_train, dtype=np.float64))
        Y_test = np.array(Y_test)

        args_tuple = (X_train, Y_train, X_test, Y_test, d, c)
        # svm_process also returns test probabilities, which are not needed.
        accuracy, _ = svm_process(args_tuple)
        accuracy_list.append(accuracy)

    return np.array(accuracy_list).mean()
예제 #8
0
def runShogunSVMDNAWDNoPositionKernel(train_xt, train_lt, test_xt):
    """Train a LibSVM with a weighted-degree (non-position) string kernel.

    Relies on the module-level names ``DEGREE``, ``SVMC``, ``MSG_DEBUG``,
    ``arange`` and ``double`` as well as the Shogun classes used below.

    Args:
        train_xt: training strings, wrapped in ``StringCharFeatures(.., DNA)``.
        train_lt: training labels, wrapped in ``BinaryLabels``.
        test_xt: test strings, wrapped in ``StringCharFeatures(.., DNA)``.

    Returns:
        ``(out1, out2)``: the ``get_labels()`` results of ``svm.apply`` on the
        training and on the test features.
    """

    ##################################################
    # set up svm
    feats_train = StringCharFeatures(train_xt, DNA)
    feats_test = StringCharFeatures(test_xt, DNA)

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, DEGREE)
    kernel.io.set_loglevel(MSG_DEBUG)

    # Weights DEGREE, DEGREE-1, ..., 1, normalised to sum to one.
    weights = arange(1, DEGREE + 1, dtype=double)[::-1] / \
        sum(arange(1, DEGREE + 1, dtype=double))
    kernel.set_wd_weights(weights)

    # labels for training
    labels = BinaryLabels(train_lt)

    # run svm model
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Ready to train!")
    svm = LibSVM(SVMC, kernel, labels)
    svm.io.set_loglevel(MSG_DEBUG)
    svm.train()

    # predictions
    print("Making predictions!")
    out1 = svm.apply(feats_train).get_labels()
    # Re-initialise the kernel on (train, test) before predicting on test data.
    kernel.init(feats_train, feats_test)
    out2 = svm.apply(feats_test).get_labels()

    return out1, out2
def kernel_weighted_degree_string_modular(fm_train_dna=traindat,
                                          fm_test_dna=testdat,
                                          degree=20):
    """Compute WeightedDegreeStringKernel matrices for DNA strings.

    The kernel's degree weights are set to ``degree, degree-1, ..., 1``
    divided by their sum.

    Returns:
        ``(km_train, km_test, kernel)``: the kernel matrix on (train, train),
        the kernel matrix on (train, test) and the kernel object.
    """
    from modshogun import StringCharFeatures, DNA
    from modshogun import WeightedDegreeStringKernel, MSG_DEBUG

    train_strings = StringCharFeatures(fm_train_dna, DNA)
    test_strings = StringCharFeatures(fm_test_dna, DNA)

    kernel = WeightedDegreeStringKernel(train_strings, train_strings, degree)

    from numpy import arange, double
    # Descending ranks, normalised so the weights add up to one.
    descending = arange(1, degree + 1, dtype=double)[::-1]
    total = sum(arange(1, degree + 1, dtype=double))
    kernel.set_wd_weights(descending / total)

    km_train = kernel.get_kernel_matrix()
    kernel.init(train_strings, test_strings)
    km_test = kernel.get_kernel_matrix()

    # The original example notes the kernel can be serialised with pickle
    # (protocol=2) and loaded back; that is not done here.

    return km_train, km_test, kernel
def runShogunSVMDNAWDKernel(train_xt, train_lt, test_xt):
    """Train a LibSVM with a weighted-degree position string kernel.

    Relies on the module-level names ``DEGREE``, ``NUMSHIFTS``, ``SVMC``,
    ``MSG_DEBUG``, ``ones``, ``int32`` and ``float64`` as well as the Shogun
    classes used below. Shifts and position weights are sized from the
    length of the first training string.

    Args:
        train_xt: training strings, wrapped in ``StringCharFeatures(.., DNA)``.
        train_lt: training labels, wrapped in ``BinaryLabels``.
        test_xt: test strings, wrapped in ``StringCharFeatures(.., DNA)``.

    Returns:
        ``(out1, out2, out1DecisionValues, out2DecisionValues)`` where the
        ``*DecisionValues`` entries are the objects returned by
        ``svm.apply`` on the training / test features and ``out1`` / ``out2``
        are their ``get_labels()`` results.
    """

    ##################################################
    # set up svm
    feats_train = StringCharFeatures(train_xt, DNA)
    feats_test = StringCharFeatures(test_xt, DNA)

    kernel = WeightedDegreePositionStringKernel(feats_train, feats_train,
                                                DEGREE)
    kernel.io.set_loglevel(MSG_DEBUG)
    # One shift and one (unit) position weight per character position.
    kernel.set_shifts(NUMSHIFTS * ones(len(train_xt[0]), dtype=int32))
    kernel.set_position_weights(ones(len(train_xt[0]), dtype=float64))

    # labels for training
    labels = BinaryLabels(train_lt)

    # run svm model
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Ready to train!")
    svm = LibSVM(SVMC, kernel, labels)
    svm.io.set_loglevel(MSG_DEBUG)
    svm.train()

    # predictions
    print("Making predictions!")
    out1DecisionValues = svm.apply(feats_train)
    out1 = out1DecisionValues.get_labels()
    # Re-initialise the kernel on (train, test) before predicting on test data.
    kernel.init(feats_train, feats_test)
    out2DecisionValues = svm.apply(feats_test)
    out2 = out2DecisionValues.get_labels()

    return out1, out2, out1DecisionValues, out2DecisionValues
예제 #11
0
def kernel_combined_modular(fm_train_real=traindat,
                            fm_test_real=testdat,
                            fm_train_dna=traindna,
                            fm_test_dna=testdna):
    """Build a CombinedKernel from three sub-kernels and evaluate it.

    Sub-kernels, in the order they are appended: a ``GaussianKernel`` on the
    real-valued features, then a ``FixedDegreeStringKernel`` and a
    ``LocalAlignmentStringKernel`` on DNA string features.

    Returns:
        ``(km_train, km_test, kernel)``: the kernel matrix on (train, train),
        the kernel matrix on (train, test) and the combined kernel.
    """
    from modshogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from modshogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    kernel = CombinedKernel()
    combined_train = CombinedFeatures()
    combined_test = CombinedFeatures()

    def _attach(part_train, part_test, part_kernel):
        # Register one feature pair together with its sub-kernel.
        combined_train.append_feature_obj(part_train)
        combined_test.append_feature_obj(part_test)
        kernel.append_kernel(part_kernel)

    _attach(RealFeatures(fm_train_real),
            RealFeatures(fm_test_real),
            GaussianKernel(10, 1.1))

    degree = 3
    _attach(StringCharFeatures(fm_train_dna, DNA),
            StringCharFeatures(fm_test_dna, DNA),
            FixedDegreeStringKernel(10, degree))

    _attach(StringCharFeatures(fm_train_dna, DNA),
            StringCharFeatures(fm_test_dna, DNA),
            LocalAlignmentStringKernel(10))

    kernel.init(combined_train, combined_train)
    km_train = kernel.get_kernel_matrix()
    kernel.init(combined_train, combined_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
예제 #12
0
def kernel_match_word_string_modular(fm_train_dna=traindat,
                                     fm_test_dna=testdat,
                                     degree=3,
                                     scale=1.4,
                                     size_cache=10,
                                     order=3,
                                     gap=0,
                                     reverse=False):
    """Compute MatchWordStringKernel matrices for DNA word features.

    The kernel is created from ``size_cache`` and ``degree``, given an
    ``AvgDiagKernelNormalizer(scale)`` and then initialised on the features.

    Returns:
        ``(km_train, km_test, kernel)``: the kernel matrix on (train, train),
        the kernel matrix on (train, test) and the kernel object.
    """
    from modshogun import MatchWordStringKernel, AvgDiagKernelNormalizer
    from modshogun import StringWordFeatures, StringCharFeatures, DNA

    def _dna_words(sequences):
        # Character features -> word features of the requested order.
        chars = StringCharFeatures(sequences, DNA)
        words = StringWordFeatures(DNA)
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    words_train = _dna_words(fm_train_dna)
    words_test = _dna_words(fm_test_dna)

    kernel = MatchWordStringKernel(size_cache, degree)
    normalizer = AvgDiagKernelNormalizer(scale)
    kernel.set_normalizer(normalizer)
    kernel.init(words_train, words_train)
    km_train = kernel.get_kernel_matrix()

    kernel.init(words_train, words_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def kernel_salzberg_word_string_modular(fm_train_dna=traindat,
                                        fm_test_dna=testdat,
                                        label_train_dna=label_traindat,
                                        order=3,
                                        gap=0,
                                        reverse=False):
    """Compute SalzbergWordStringKernel matrices for DNA word features.

    A default-constructed ``PluginEstimate`` is trained on the training word
    features and labels and handed to the kernel together with the labels.
    The estimate is also applied to the test features, but that prediction
    is discarded.

    Returns:
        ``(km_train, km_test, kernel)``: the kernel matrix on (train, train),
        the kernel matrix on (train, test) and the kernel object.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from modshogun import SalzbergWordStringKernel
    from modshogun import PluginEstimate

    def _dna_words(sequences):
        # Character features -> word features of the requested order.
        chars = StringCharFeatures(sequences, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    words_train = _dna_words(fm_train_dna)
    words_test = _dna_words(fm_test_dna)

    estimator = PluginEstimate()
    train_labels = BinaryLabels(label_train_dna)
    estimator.set_labels(train_labels)
    estimator.set_features(words_train)
    estimator.train()

    kernel = SalzbergWordStringKernel(
        words_train, words_train, estimator, train_labels)
    km_train = kernel.get_kernel_matrix()

    kernel.init(words_train, words_test)
    estimator.set_features(words_test)
    # Prediction on the test features; the result is intentionally unused.
    estimator.apply().get_labels()
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
예제 #14
0
def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
    """Train an HMM on CUBE word features and query its statistics.

    The model is built as ``HMM(feats, N, M, pseudo)``, trained with
    ``train()`` and then with ``baum_welch_viterbi_train(BW_NORMAL)``.
    Log-derivatives, best paths and best-path states are queried for every
    example; those values are accumulated locally but not returned.

    Note: the ``num_examples`` argument is not used; the number of examples
    is always taken from the feature object.

    Returns:
        ``(lik_example, lik_sample, hmm)``: the results of
        ``get_log_likelihood()`` and ``get_log_likelihood_sample()`` and the
        trained model.
    """
    from modshogun import StringWordFeatures, StringCharFeatures, CUBE
    from modshogun import HMM, BW_NORMAL

    cube_chars = StringCharFeatures(CUBE)
    cube_chars.set_features(fm_cube)
    word_feats = StringWordFeatures(cube_chars.get_alphabet())
    word_feats.obtain_from_char(cube_chars, order - 1, order, gap, reverse)

    model = HMM(word_feats, N, M, pseudo)
    model.train()
    model.baum_welch_viterbi_train(BW_NORMAL)

    vector_count = word_feats.get_num_vectors()
    param_count = model.get_num_model_parameters()
    for example_idx in range(vector_count):
        for param_idx in range(param_count):
            model.get_log_derivative(param_idx, example_idx)

    path_total = 0
    path_state_total = 0
    for example_idx in range(vector_count):
        path_total += model.best_path(example_idx)
        for state_idx in range(N):
            path_state_total += model.get_best_path_state(example_idx,
                                                          state_idx)

    lik_example = model.get_log_likelihood()
    lik_sample = model.get_log_likelihood_sample()

    return lik_example, lik_sample, model
예제 #15
0
def classifier_domainadaptationsvm_modular(fm_train_dna=traindna,
                                           fm_test_dna=testdna,
                                           label_train_dna=label_traindna,
                                           label_test_dna=label_testdna,
                                           fm_train_dna2=traindna2,
                                           fm_test_dna2=testdna2,
                                           label_train_dna2=label_traindna2,
                                           label_test_dna2=label_testdna2,
                                           C=1,
                                           degree=3):
    """Train a DomainAdaptationSVM regularised against a pre-trained SVM.

    An ``SVMLight`` is first trained on the first data set
    (``fm_train_dna`` / ``label_train_dna``). A ``DomainAdaptationSVM`` is
    then trained on the second data set (``fm_train_dna2`` /
    ``label_train_dna2``) with the first SVM passed in as the previous
    solution, and applied to ``fm_test_dna2``.

    ``label_test_dna`` and ``label_test_dna2`` are accepted but not used.

    Returns:
        The result of ``dasvm.apply_binary`` on the second test set.
    """
    # First domain: plain SVM.
    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    labels = BinaryLabels(label_train_dna)
    svm = SVMLight(C, kernel, labels)
    svm.train()

    #####################################

    # Second domain: build features, kernel and labels from the *2 inputs.
    # The original reused the first-domain data here, leaving the *2
    # parameters unused.
    feats_train2 = StringCharFeatures(fm_train_dna2, DNA)
    feats_test2 = StringCharFeatures(fm_test_dna2, DNA)
    kernel2 = WeightedDegreeStringKernel(feats_train2, feats_train2, degree)
    labels2 = BinaryLabels(label_train_dna2)

    # we regularize against the previously obtained solution
    dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
    dasvm.train()

    out = dasvm.apply_binary(feats_test2)

    return out  # ,dasvm TODO
예제 #16
0
def classifier_ssk_modular(fm_train_dna=traindat,
                           fm_test_dna=testdat,
                           label_train_dna=label_traindat,
                           C=1,
                           maxlen=1,
                           decay=1):
    """Train a LibSVM with a StringSubsequenceKernel and predict test labels.

    The training error is evaluated with ``ErrorRateMeasure`` but is not
    returned.

    Returns:
        The ``get_labels()`` result of ``svm.apply`` on the test features.
    """
    from modshogun import StringCharFeatures, BinaryLabels
    from modshogun import LibSVM, StringSubsequenceKernel, DNA
    from modshogun import ErrorRateMeasure

    train_strings = StringCharFeatures(fm_train_dna, DNA)
    test_strings = StringCharFeatures(fm_test_dna, DNA)
    train_labels = BinaryLabels(label_train_dna)
    kernel = StringSubsequenceKernel(
        train_strings, train_strings, maxlen, decay)

    classifier = LibSVM(C, kernel, train_labels)
    classifier.train()

    # Training error is computed for reference only.
    train_prediction = classifier.apply(train_strings)
    ErrorRateMeasure().evaluate(train_prediction, train_labels)

    kernel.init(train_strings, test_strings)
    return classifier.apply(test_strings).get_labels()
def distance_canberraword_modular(fm_train_dna=traindna,
                                  fm_test_dna=testdna,
                                  order=3,
                                  gap=0,
                                  reverse=False):
    """Compute CanberraWordDistance matrices for sorted DNA word features.

    Both feature sets are passed through the same ``SortWordString``
    preprocessor, which is initialised on the training features only.

    Returns:
        ``(distance, dm_train, dm_test)``: the distance object, the distance
        matrix on (train, train) and the distance matrix on (train, test).
    """
    from modshogun import StringCharFeatures, StringWordFeatures, DNA
    from modshogun import SortWordString
    from modshogun import CanberraWordDistance

    def _dna_words(sequences):
        # Character features -> word features of the requested order.
        chars = StringCharFeatures(DNA)
        chars.set_features(sequences)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    words_train = _dna_words(fm_train_dna)
    sorter = SortWordString()
    sorter.init(words_train)
    words_train.add_preprocessor(sorter)
    words_train.apply_preprocessor()

    words_test = _dna_words(fm_test_dna)
    words_test.add_preprocessor(sorter)
    words_test.apply_preprocessor()

    distance = CanberraWordDistance(words_train, words_train)
    dm_train = distance.get_distance_matrix()

    distance.init(words_train, words_test)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test
def distribution_linearhmm_modular(fm_dna=traindna, order=3, gap=0, reverse=False):
    """Fit a LinearHMM to DNA word features and report its likelihoods.

    The input strings become DNA character features and are then turned
    into word features through ``obtain_from_char``. Transition
    probabilities and all per-example, per-parameter log-derivatives are
    requested after training, but their values are thrown away.

    Returns:
        ``(hmm, out_likelihood, out_sample)``: the trained model followed by
        the results of ``get_log_likelihood()`` and
        ``get_log_likelihood_sample()``.
    """
    from modshogun import StringWordFeatures, StringCharFeatures, DNA
    from modshogun import LinearHMM

    chars = StringCharFeatures(DNA)
    chars.set_features(fm_dna)
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, order - 1, order, gap, reverse)

    hmm = LinearHMM(words)
    hmm.train()

    # Accessor call only; nothing is done with the result.
    hmm.get_transition_probs()

    n_vectors = words.get_num_vectors()
    n_params = hmm.get_num_model_parameters()
    for vec in range(n_vectors):
        for par in range(n_params):
            hmm.get_log_derivative(par, vec)

    out_likelihood = hmm.get_log_likelihood()
    out_sample = hmm.get_log_likelihood_sample()

    return hmm, out_likelihood, out_sample
def kernel_comm_word_string_modular(fm_train_dna=traindat,
                                    fm_test_dna=testdat,
                                    order=3,
                                    gap=0,
                                    reverse=False,
                                    use_sign=False):
    """Compute CommWordStringKernel matrices for sorted DNA word features.

    Both feature sets are passed through the same ``SortWordString``
    preprocessor, which is initialised on the training features only.

    Returns:
        ``(km_train, km_test, kernel)``: the kernel matrix on (train, train),
        the kernel matrix on (train, test) and the kernel object.
    """
    from modshogun import CommWordStringKernel
    from modshogun import StringWordFeatures, StringCharFeatures, DNA
    from modshogun import SortWordString

    def _dna_words(sequences):
        # Character features -> word features of the requested order.
        chars = StringCharFeatures(DNA)
        chars.set_features(sequences)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    words_train = _dna_words(fm_train_dna)
    sorter = SortWordString()
    sorter.init(words_train)
    words_train.add_preprocessor(sorter)
    words_train.apply_preprocessor()

    words_test = _dna_words(fm_test_dna)
    words_test.add_preprocessor(sorter)
    words_test.apply_preprocessor()

    kernel = CommWordStringKernel(words_train, words_train, use_sign)
    km_train = kernel.get_kernel_matrix()

    kernel.init(words_train, words_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
예제 #20
0
def runShogunSVMDNACombinedSpectrumKernel(train_xt, train_lt, test_xt):
    """Train a LibSVM with a combined spectrum kernel on DNA strings.

    For every ``K`` in the module-level ``KList`` a pair of sorted word
    feature objects (train / test) and one ``CommWordStringKernel`` are
    appended to the combined features / combined kernel. Also relies on the
    module-level names ``GAP``, ``SVMC`` and ``MSG_DEBUG``.

    Args:
        train_xt: training strings, wrapped in ``StringCharFeatures(.., DNA)``.
        train_lt: training labels, wrapped in ``BinaryLabels``.
        test_xt: test strings, wrapped in ``StringCharFeatures(.., DNA)``.

    Returns:
        ``(out1, out2, out1DecisionValues, out2DecisionValues)`` where the
        ``*DecisionValues`` entries are the objects returned by
        ``svm.apply`` on the training / test features and ``out1`` / ``out2``
        are their ``get_labels()`` results.
    """

    ##################################################
    # set up svm
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    for K in KList:
        # Iterate through the K's and make a spectrum kernel for each
        charfeat_train = StringCharFeatures(train_xt, DNA)
        current_feats_train = StringWordFeatures(DNA)
        current_feats_train.obtain_from_char(charfeat_train, K - 1, K, GAP,
                                             False)
        preproc = SortWordString()
        preproc.init(current_feats_train)
        current_feats_train.add_preprocessor(preproc)
        current_feats_train.apply_preprocessor()
        feats_train.append_feature_obj(current_feats_train)

        # The test features reuse the preprocessor initialised on training.
        charfeat_test = StringCharFeatures(test_xt, DNA)
        current_feats_test = StringWordFeatures(DNA)
        current_feats_test.obtain_from_char(charfeat_test, K - 1, K, GAP,
                                            False)
        current_feats_test.add_preprocessor(preproc)
        current_feats_test.apply_preprocessor()
        feats_test.append_feature_obj(current_feats_test)

        current_kernel = CommWordStringKernel(10, False)
        kernel.append_kernel(current_kernel)

    kernel.io.set_loglevel(MSG_DEBUG)

    # labels for training
    labels = BinaryLabels(train_lt)

    # run svm model
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Ready to train!")
    kernel.init(feats_train, feats_train)
    svm = LibSVM(SVMC, kernel, labels)
    svm.io.set_loglevel(MSG_DEBUG)
    svm.train()

    # predictions
    print("Making predictions!")
    out1DecisionValues = svm.apply(feats_train)
    out1 = out1DecisionValues.get_labels()
    # Re-initialise the kernel on (train, test) before predicting on test data.
    kernel.init(feats_train, feats_test)
    out2DecisionValues = svm.apply(feats_test)
    out2 = out2DecisionValues.get_labels()

    return out1, out2, out1DecisionValues, out2DecisionValues
def get_wd_features(data, feat_type="dna"):
    """Build string character features for the weighted-degree kernel.

    Args:
        data: strings handed to ``set_features``.
        feat_type: ``"dna"`` or ``"protein"``; selects the alphabet.

    Returns:
        A ``StringCharFeatures`` object holding ``data``.

    Raises:
        Exception: if ``feat_type`` is neither ``"dna"`` nor ``"protein"``.
    """
    # Resolve the alphabet first so unknown types fail before any
    # feature object is created.
    if feat_type == "dna":
        alphabet = DNA
    elif feat_type == "protein":
        alphabet = PROTEIN
    else:
        raise Exception("unknown feature type")

    string_feats = StringCharFeatures(alphabet)
    string_feats.set_features(data)
    return string_feats
def kernel_linear_string_modular(fm_train_dna=traindat, fm_test_dna=testdat):
    """Compute LinearStringKernel matrices for DNA strings.

    Returns:
        ``(km_train, km_test, kernel)``: the kernel matrix on (train, train),
        the kernel matrix on (train, test) and the kernel object.
    """
    from modshogun import StringCharFeatures, DNA
    from modshogun import LinearStringKernel

    train_strings = StringCharFeatures(fm_train_dna, DNA)
    test_strings = StringCharFeatures(fm_test_dna, DNA)

    kernel = LinearStringKernel(train_strings, train_strings)
    km_train = kernel.get_kernel_matrix()

    # Switch the right-hand side to the test strings.
    kernel.init(train_strings, test_strings)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def kernel_poly_match_string_modular(fm_train_dna=traindat,
                                     fm_test_dna=testdat,
                                     degree=3,
                                     inhomogene=False):
    """Compute PolyMatchStringKernel matrices for DNA strings.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        degree: degree passed to the kernel constructor.
        inhomogene: inhomogeneity flag passed to the kernel constructor.

    Returns:
        ``(km_train, km_test, kernel)``: the kernel matrix on (train, train),
        the kernel matrix on (train, test) and the kernel object.
    """
    from modshogun import PolyMatchStringKernel
    from modshogun import StringCharFeatures, DNA

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    # The test features must come from fm_test_dna; the original built them
    # from the training data, so km_test was just a second training matrix.
    feats_test = StringCharFeatures(fm_test_dna, DNA)

    kernel = PolyMatchStringKernel(feats_train, feats_train, degree,
                                   inhomogene)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """Build sorted word features over the PROTEIN alphabet.

    ``data`` is wrapped in PROTEIN character features, converted to word
    features with ``obtain_from_char(chars, order - 1, order, gap, reverse)``
    and run through a ``SortWordString`` preprocessor.

    Returns:
        The preprocessed ``StringWordFeatures`` object.
    """
    protein_chars = StringCharFeatures(data, PROTEIN)
    words = StringWordFeatures(protein_chars.get_alphabet())
    words.obtain_from_char(protein_chars, order - 1, order, gap, reverse)

    sorter = SortWordString()
    sorter.init(words)
    words.add_preprocessor(sorter)
    words.apply_preprocessor()

    return words
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """Build sorted word features over the DNA alphabet.

    ``data`` is wrapped in DNA character features, converted to word
    features with ``obtain_from_char(chars, order - 1, order, gap, reverse)``
    and run through a ``SortWordString`` preprocessor.

    Returns:
        The preprocessed ``StringWordFeatures`` object.
    """
    dna_chars = StringCharFeatures(data, DNA)
    spectrum = StringWordFeatures(dna_chars.get_alphabet())
    spectrum.obtain_from_char(dna_chars, order - 1, order, gap, reverse)

    word_sorter = SortWordString()
    word_sorter.init(spectrum)
    spectrum.add_preprocessor(word_sorter)
    spectrum.apply_preprocessor()

    return spectrum
def classifier_svmlight_linear_term_modular(fm_train_dna=traindna,
                                            fm_test_dna=testdna,
                                            label_train_dna=label_traindna,
                                            degree=3,
                                            C=10,
                                            epsilon=1e-5,
                                            num_threads=1):
    """Train an SVMLight with a custom linear term and predict test labels.

    The linear term is a fixed 10-element vector, negated before it is
    handed to the SVM. If ``SVMLight`` cannot be imported, a message is
    printed and ``exit(0)`` is called.

    Returns:
        ``(out, kernel)``: the ``get_labels()`` result of ``svm.apply()``
        after the kernel was re-initialised on (train, test), and the
        kernel object.
    """
    from modshogun import StringCharFeatures, BinaryLabels, DNA
    from modshogun import WeightedDegreeStringKernel
    try:
        from modshogun import SVMLight
    except ImportError:
        print("SVMLight is not available")
        exit(0)

    train_strings = StringCharFeatures(DNA)
    train_strings.set_features(fm_train_dna)
    test_strings = StringCharFeatures(DNA)
    test_strings.set_features(fm_test_dna)

    kernel = WeightedDegreeStringKernel(train_strings, train_strings, degree)
    train_labels = BinaryLabels(label_train_dna)

    classifier = SVMLight(C, kernel, train_labels)
    classifier.set_qpsize(3)
    linear_term = numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 7, 6],
                              dtype=numpy.double)
    classifier.set_linear_term(-linear_term)
    classifier.set_epsilon(epsilon)
    classifier.parallel.set_num_threads(num_threads)
    classifier.train()

    kernel.init(train_strings, test_strings)
    predictions = classifier.apply().get_labels()
    return predictions, kernel
def preprocessor_sortulongstring_modular(fm_train_dna=traindna,
                                         fm_test_dna=testdna,
                                         order=3,
                                         gap=0,
                                         reverse=False,
                                         use_sign=False):
    """Apply SortUlongString to DNA ulong features and compute kernels.

    Train and test ulong features are both built first; a single
    ``SortUlongString`` preprocessor, initialised on the training features,
    is then applied to each before a ``CommUlongStringKernel`` is evaluated.

    Returns:
        ``(km_train, km_test, kernel)``: the kernel matrix on (train, train),
        the kernel matrix on (train, test) and the kernel object.
    """
    from modshogun import CommUlongStringKernel
    from modshogun import StringCharFeatures, StringUlongFeatures, DNA
    from modshogun import SortUlongString

    def _dna_ulongs(sequences):
        # Character features -> ulong features of the requested order.
        chars = StringCharFeatures(DNA)
        chars.set_features(sequences)
        ulongs = StringUlongFeatures(chars.get_alphabet())
        ulongs.obtain_from_char(chars, order - 1, order, gap, reverse)
        return ulongs

    ulongs_train = _dna_ulongs(fm_train_dna)
    ulongs_test = _dna_ulongs(fm_test_dna)

    sorter = SortUlongString()
    sorter.init(ulongs_train)
    ulongs_train.add_preprocessor(sorter)
    ulongs_train.apply_preprocessor()
    ulongs_test.add_preprocessor(sorter)
    ulongs_test.apply_preprocessor()

    kernel = CommUlongStringKernel(ulongs_train, ulongs_train, use_sign)
    km_train = kernel.get_kernel_matrix()

    kernel.init(ulongs_train, ulongs_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def distance_hammingword_modular(fm_train_dna=traindna,
                                 fm_test_dna=testdna,
                                 fm_test_real=testdat,
                                 order=3,
                                 gap=0,
                                 reverse=False,
                                 use_sign=False):
    """Compute HammingWordDistance matrices for sorted DNA word features.

    Both feature sets are passed through the same ``SortWordString``
    preprocessor, which is initialised on the training features only.
    ``fm_test_real`` is accepted but not used.

    Returns:
        ``(distance, dm_train, dm_test)``: the distance object, the distance
        matrix on (train, train) and the distance matrix on (train, test).
    """
    from modshogun import StringCharFeatures, StringWordFeatures, DNA
    from modshogun import SortWordString
    from modshogun import HammingWordDistance

    def _dna_words(sequences):
        # Character features -> word features of the requested order.
        chars = StringCharFeatures(DNA)
        chars.set_features(sequences)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    words_train = _dna_words(fm_train_dna)
    sorter = SortWordString()
    sorter.init(words_train)
    words_train.add_preprocessor(sorter)
    words_train.apply_preprocessor()

    words_test = _dna_words(fm_test_dna)
    words_test.add_preprocessor(sorter)
    words_test.apply_preprocessor()

    distance = HammingWordDistance(words_train, words_train, use_sign)
    dm_train = distance.get_distance_matrix()

    distance.init(words_train, words_test)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test
예제 #29
0
def classifier_svmlight_modular(fm_train_dna=traindat,
                                fm_test_dna=testdat,
                                label_train_dna=label_traindat,
                                C=1.2,
                                epsilon=1e-5,
                                num_threads=1):
    """Train an SVMLight with a degree-20 weighted-degree string kernel.

    Predictions on the test data are computed but discarded. If ``SVMLight``
    cannot be imported, a message is printed and ``None`` is returned.

    Returns:
        The kernel object, initialised on (train, test).
    """
    from modshogun import StringCharFeatures, BinaryLabels, DNA
    from modshogun import WeightedDegreeStringKernel
    try:
        from modshogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    train_strings = StringCharFeatures(DNA)
    train_strings.set_features(fm_train_dna)
    test_strings = StringCharFeatures(DNA)
    test_strings.set_features(fm_test_dna)

    wd_degree = 20
    kernel = WeightedDegreeStringKernel(train_strings, train_strings,
                                        wd_degree)
    train_labels = BinaryLabels(label_train_dna)

    classifier = SVMLight(C, kernel, train_labels)
    classifier.set_epsilon(epsilon)
    classifier.parallel.set_num_threads(num_threads)
    classifier.train()

    kernel.init(train_strings, test_strings)
    # Predict on the test data; the labels are not used further.
    classifier.apply().get_labels()
    return kernel
예제 #30
0
def get_kernel_mat(fm_train_dna,
                   fm_test_dna,
                   N,
                   M,
                   pseudo=1e-1,
                   order=1,
                   gap=0,
                   reverse=False):
    """Compute linear kernels on ``FKFeatures`` derived from an HMM.

    An HMM (``pos``) is trained with Baum-Welch on word features of the
    training strings; ``neg`` is created as ``HMM(pos)``. ``FKFeatures`` are
    built from the two models for the training data, and from copies of the
    models for the test data, and a ``LinearKernel`` is evaluated on them.
    Progress messages and a few diagnostic values are printed.

    Args:
        fm_train_dna: training strings.
        fm_test_dna: test strings.
        N, M, pseudo: forwarded to the ``HMM`` constructor.
        order, gap, reverse: forwarded to ``obtain_from_char``.

    Returns:
        ``(km_train, km_test, kernel)``: the kernel matrix on (train, train),
        the kernel matrix on (train, test) and the kernel object.
    """
    # All prints use the single-argument parenthesised form so the output is
    # identical on Python 2 and Python 3.

    # train HMM for positive class
    print("hmm training")
    charfeat = StringCharFeatures(fm_train_dna, DNA)
    hmm_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    pos = HMM(hmm_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)
    neg = HMM(pos)

    print("Kernel training data")
    charfeat = StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    print("Kernel testing data")
    charfeat = StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    print("get kernel on training data")
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1)  # estimate prior

    print('getting feature matrix')
    v0 = feats_train.get_feature_vector(0)
    v1 = feats_train.get_feature_vector(1)
    print(np.dot(v0, v1))
    kernel = LinearKernel(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()
    # Formatted into one string: print(a, b) would print a tuple on Python 2.
    print("%s %s" % (km_train.shape, km_train[0, 1]))

    print("get kernel on testing data")
    # Work on copies so the models attached to feats_train keep their
    # training observations.
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a())  # use prior from training data
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def make_string_feature(astringv, start=1, order=8, gap=0, reverse=False):
    """Build sorted ulong features from raw-byte strings.

    ``astringv`` is wrapped in RAWBYTE character features, converted with
    ``obtain_from_char(chars, start, order, gap, reverse)`` and run through
    a ``SortUlongString`` preprocessor.

    Returns:
        The preprocessed ``StringUlongFeatures`` object.
    """
    from modshogun import StringUlongFeatures, StringCharFeatures, RAWBYTE
    from modshogun import SortUlongString

    raw_chars = StringCharFeatures(astringv, RAWBYTE)

    ulong_feats = StringUlongFeatures(raw_chars.get_alphabet())
    ulong_feats.obtain_from_char(raw_chars, start, order, gap, reverse)

    sorter = SortUlongString()
    sorter.init(ulong_feats)
    ulong_feats.add_preprocessor(sorter)
    ulong_feats.apply_preprocessor()

    return ulong_feats
def kernel_locality_improved_string_modular(fm_train_dna=traindat,
                                            fm_test_dna=testdat,
                                            length=5,
                                            inner_degree=5,
                                            outer_degree=7):
    """Compute train and test matrices of a LocalityImprovedStringKernel.

    Both string collections are wrapped as DNA char features. The kernel is
    built on (train, train) to get the training matrix and then re-initialised
    on (train, test) to get the test matrix.

    Returns ``(km_train, km_test, kernel)``; the kernel is left initialised on
    the (train, test) pair.
    """
    from modshogun import StringCharFeatures, DNA
    from modshogun import LocalityImprovedStringKernel

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    kernel = LocalityImprovedStringKernel(
        train_feats, train_feats, length, inner_degree, outer_degree
    )
    km_train = kernel.get_kernel_matrix()

    kernel.init(train_feats, test_feats)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
 def get_k_nearest_docs(self, docs_list, doc_index, k, keywords_list):
     """Return the documents most similar to ``docs_list[doc_index]``.

     Every document is cleaned with ``self.get_clean_text`` and compared
     with a ``SubsequenceStringKernel`` (n=7, lambda=0.2) over RAWBYTE
     string features. The row of the kernel matrix belonging to
     ``doc_index`` is ranked in descending order of similarity; ties are
     broken by the higher document index first.

     The query document itself is part of the ranking (it is not filtered
     out). ``keywords_list`` is accepted but not used here.

     Returns:
         (docs, scores): ``docs`` is a list of at most ``k`` entries of
         ``docs_list``; ``scores`` is a numpy array of shape
         ``(len(docs), 1, 1)`` with the matching similarity values.
     """
     cleaned_docs = [self.get_clean_text(doc) for doc in docs_list]
     features = StringCharFeatures(cleaned_docs, RAWBYTE)

     subsequence_len = 7
     decay = 0.2
     kernel = SubsequenceStringKernel(features, features, subsequence_len,
                                      decay)
     sim_mat = kernel.get_kernel_matrix()

     # (score, index) tuples so that sorting ranks by similarity first.
     ranked = sorted(
         ((score, idx) for idx, score in enumerate(sim_mat[doc_index])),
         reverse=True,
     )[:k]

     nearest_docs = [docs_list[idx] for _, idx in ranked]
     # Reshape by the number of hits actually found: with fewer than k
     # documents a fixed reshape(k, 1, 1) would raise ValueError.
     nearest_scores = np.array(
         [score for score, _ in ranked]
     ).reshape(len(ranked), 1, 1)
     return nearest_docs, nearest_scores
예제 #34
0
def kernel_fixed_degree_string_modular(fm_train_dna=traindat,
                                       fm_test_dna=testdat,
                                       degree=3):
    """Compute train and test matrices of a FixedDegreeStringKernel.

    The kernel is first built on (train, train) DNA char features and then
    re-initialised on (train, test).

    Returns ``(km_train, km_test, kernel)``.
    """
    from modshogun import StringCharFeatures, DNA
    from modshogun import FixedDegreeStringKernel

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    # Training matrix: kernel evaluated on the training set against itself.
    kernel = FixedDegreeStringKernel(train_feats, train_feats, degree)
    train_matrix = kernel.get_kernel_matrix()

    # Test matrix: same kernel object, right-hand side swapped to test data.
    kernel.init(train_feats, test_feats)
    test_matrix = kernel.get_kernel_matrix()

    return train_matrix, test_matrix, kernel
def features_hasheddocdot_modular(strings):
    """Wrap ``strings`` as HashedDocDotFeatures and return them.

    The strings become RAWBYTE char features that are tokenised with an
    ``NGramTokenizer(8)`` and hashed into ``2**5 = 32`` dimensions
    (``num_bits = 5``) with normalisation switched on.
    """
    from modshogun import StringCharFeatures, RAWBYTE
    from modshogun import HashedDocDotFeatures
    from modshogun import NGramTokenizer
    from numpy import array

    raw_docs = StringCharFeatures(strings, RAWBYTE)

    hash_bits = 5                        # target dimension is 2**hash_bits
    ngram_tokenizer = NGramTokenizer(8)  # n-grams of size 8
    normalize_output = True

    return HashedDocDotFeatures(hash_bits, raw_docs, ngram_tokenizer,
                                normalize_output)
def runShogunOneClassSVMDNASpectrumKernel(train_xt, train_lt, test_xt):
    """
    Fit a one-class LibSVM on DNA strings with a CommWordString kernel.

    Both string sets are embedded as word features via ``obtain_from_char``
    using the module-level ``K`` and ``GAP`` settings, and sorted with one
    shared ``SortWordString`` preprocessor initialised on the training
    features.

    Returns ``(out1, out2, out1DecisionValues, out2DecisionValues)``: the
    ``get_labels()`` arrays for train and test, followed by the objects
    returned by ``svm.apply`` for train and test.
    """
    # Training strings -> sorted word features.
    train_chars = StringCharFeatures(train_xt, DNA)
    train_words = StringWordFeatures(DNA)
    train_words.obtain_from_char(train_chars, K - 1, K, GAP, False)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preprocessor(sorter)
    train_words.apply_preprocessor()

    # Test strings reuse the sorter that was initialised on the training set.
    test_chars = StringCharFeatures(test_xt, DNA)
    test_words = StringWordFeatures(DNA)
    test_words.obtain_from_char(test_chars, K - 1, K, GAP, False)
    test_words.add_preprocessor(sorter)
    test_words.apply_preprocessor()

    kernel = CommWordStringKernel(train_words, train_words, False)
    kernel.io.set_loglevel(MSG_DEBUG)

    # Built from train_lt as in the original flow, but never handed to the
    # one-class SVM below.
    labels = BinaryLabels(train_lt)

    print("Ready to train!")
    svm = LibSVMOneClass(SVMC, kernel)
    svm.set_epsilon(EPSILON)
    svm.train()

    print("Making predictions!")
    out1DecisionValues = svm.apply(train_words)
    out1 = out1DecisionValues.get_labels()
    # Point the kernel at the test data before applying to it.
    kernel.init(train_words, test_words)
    out2DecisionValues = svm.apply(test_words)
    out2 = out2DecisionValues.get_labels()

    return out1, out2, out1DecisionValues, out2DecisionValues
예제 #37
0
def distribution_ppwm_modular(fm_dna=traindna, order=3):
    """Build a fixed PositionalPWM and return its ``w`` and scoring matrices.

    ``fm_dna`` and ``order`` are only used to embed the DNA strings into
    ``StringByteFeatures``; those features are not handed to the PWM, whose
    sigma, mean and weight matrix are hard-coded below.

    Returns ``(w, u)`` where ``w = ppwm.get_w()`` after ``compute_w(L)`` and
    ``u = ppwm.get_scoring(0)`` after ``compute_scoring(1)``.
    """
    from modshogun import StringByteFeatures, StringCharFeatures, DNA
    from modshogun import PositionalPWM

    from numpy import array, log

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    # Kept from the original flow; the PWM below does not consume `feats`.
    feats = StringByteFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, 0, False)

    L = 20
    sigma = 1
    mu = 4

    ppwm = PositionalPWM()
    ppwm.set_sigma(sigma)
    ppwm.set_mean(mu)

    # The only matrix that ever reaches set_pwm: a 4x3 table whose natural
    # log is passed on. (An earlier 4x4 table was overwritten before use and
    # has been removed.)
    pwm = array([[0.01, 0.09, 0.1],
                 [0.09, 0.01, 0.1],
                 [0.85, 0.4, 0.1],
                 [0.05, 0.5, 0.7]])
    ppwm.set_pwm(log(pwm))

    ppwm.compute_w(L)
    w = ppwm.get_w()

    ppwm.compute_scoring(1)
    u = ppwm.get_scoring(0)

    return w, u
예제 #38
0
def kernel_distantsegments_modular(fm_train_dna=traindat,
                                   fm_test_dna=testdat,
                                   delta=5,
                                   theta=5):
    """Compute train and test matrices of a DistantSegmentsKernel.

    The kernel is created on (train, train) DNA char features with
    ``delta`` and ``theta`` and afterwards re-initialised on (train, test).

    Returns ``(km_train, km_test, kernel)``.
    """
    from modshogun import StringCharFeatures, DNA
    from modshogun import DistantSegmentsKernel

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    # NOTE(review): the literal 10 is presumably the kernel cache size;
    # confirm against the DistantSegmentsKernel constructor.
    kernel = DistantSegmentsKernel(train_feats, train_feats, 10, delta, theta)
    train_matrix = kernel.get_kernel_matrix()

    kernel.init(train_feats, test_feats)
    test_matrix = kernel.get_kernel_matrix()

    return train_matrix, test_matrix, kernel
def distribution_ppwm_modular(fm_dna=traindna, order=3):
    """Build a fixed PositionalPWM and return its ``w`` and scoring matrices.

    ``fm_dna`` and ``order`` are only used to embed the DNA strings into
    ``StringByteFeatures``; those features are not handed to the PWM, whose
    sigma, mean and weight matrix are hard-coded below.

    Returns ``(w, u)`` where ``w = ppwm.get_w()`` after ``compute_w(L)`` and
    ``u = ppwm.get_scoring(0)`` after ``compute_scoring(1)``.
    """
    from modshogun import StringByteFeatures, StringCharFeatures, DNA
    from modshogun import PositionalPWM

    from numpy import array, log

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    # Kept from the original flow; the PWM below does not consume `feats`.
    feats = StringByteFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, 0, False)

    L = 20
    sigma = 1
    mu = 4

    ppwm = PositionalPWM()
    ppwm.set_sigma(sigma)
    ppwm.set_mean(mu)

    # The only matrix that ever reaches set_pwm: a 4x3 table whose natural
    # log is passed on. (An earlier 4x4 table was overwritten before use and
    # has been removed.)
    pwm = array([[0.01, 0.09, 0.1],
                 [0.09, 0.01, 0.1],
                 [0.85, 0.4, 0.1],
                 [0.05, 0.5, 0.7]])
    ppwm.set_pwm(log(pwm))

    ppwm.compute_w(L)
    w = ppwm.get_w()

    ppwm.compute_scoring(1)
    u = ppwm.get_scoring(0)

    return w, u
def features_string_char_modular(strings):
    """Create RAWBYTE string features and overwrite the first string.

    ``strings`` is wrapped in ``StringCharFeatures``; feature vector 0 is
    then replaced by the characters ``t, e, s, t``.

    Returns ``(features_as_returned_by_get_features, feature_object)``.
    """
    from modshogun import StringCharFeatures, RAWBYTE
    from numpy import array

    char_feats = StringCharFeatures(strings, RAWBYTE)

    # Replace string 0 with a one-character-per-element array.
    replacement = array(list('test'))
    char_feats.set_feature_vector(replacement, 0)

    return char_feats.get_features(), char_feats
예제 #41
0
	def get_predictions(self, sequence, positions):
		"""Score the windows of ``sequence`` centred on ``positions``.

		For each position ``p`` the slice
		``sequence[i - window_left : i + window_right + 2]`` with
		``i = p - self.offset`` is collected. The windows become DNA string
		features, ``self.wd_kernel`` is initialised on
		``(self.traindat, windows)`` and the labels returned by
		``self.svm.classify()`` are handed back.

		NOTE(review): a position closer than ``window_left`` to the start of
		the sequence yields a negative slice start, which Python interprets
		relative to the end of the sequence -- confirm callers never pass
		such positions.
		"""
		left = self.window_left
		right = self.window_right

		testdat = []
		for pos in positions:
			centre = pos - self.offset
			testdat.append(sequence[centre - left:centre + right + 2])

		t = StringCharFeatures(DNA)
		t.set_string_features(testdat)

		self.wd_kernel.init(self.traindat, t)
		labels = self.svm.classify().get_labels()
		sys.stderr.write("\n...done...\n")
		return labels
def kernel_fisher_modular(fm_train_dna=traindat, fm_test_dna=testdat,
                          label_train_dna=label_traindat,
                          N=1, M=4, pseudo=1e-1, order=1, gap=0, reverse=False,
                          kargs=[1, False, True]):
    """Compute Fisher-kernel train/test matrices from two class-wise HMMs.

    One HMM is trained (Baum-Welch, ``BW_NORMAL``) on the training strings
    labelled ``+1`` and one on those labelled ``-1``. Both are then pointed
    at the complete training (resp. test) word features to build
    ``FKFeatures``, on which a ``PolyKernel(*kargs)`` is evaluated.

    Args:
        fm_train_dna: training DNA strings.
        fm_test_dna: test DNA strings.
        label_train_dna: one label per training string; entries equal to 1
            select the positive HMM's data, entries equal to -1 the
            negative HMM's data.
        N, M, pseudo: passed to the ``HMM`` constructor.
        order, gap, reverse: passed to ``obtain_from_char``.
        kargs: extra positional arguments for ``PolyKernel``; only read,
            never modified.

    Returns:
        ``(km_train, km_test, kernel)``.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from modshogun import PolyKernel
    from modshogun import HMM, BW_NORMAL

    def _word_features(strings):
        # DNA strings -> word features of the requested order.
        charfeat = StringCharFeatures(strings, DNA)
        wordfeat = StringWordFeatures(charfeat.get_alphabet())
        wordfeat.obtain_from_char(charfeat, order - 1, order, gap, reverse)
        return wordfeat

    # Split the training strings by class using the labels that were passed
    # in. Previously the function read the free names fm_hmm_pos/fm_hmm_neg,
    # so label_train_dna was ignored.
    labelled = list(zip(fm_train_dna, label_train_dna))
    fm_hmm_pos = [seq for seq, lab in labelled if lab == 1]
    fm_hmm_neg = [seq for seq, lab in labelled if lab == -1]

    # train HMM for positive class
    hmm_pos_train = _word_features(fm_hmm_pos)
    pos = HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    hmm_neg_train = _word_features(fm_hmm_neg)
    neg = HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # word features the kernel is evaluated on
    wordfeats_train = _word_features(fm_train_dna)
    wordfeats_test = _word_features(fm_test_dna)

    # get kernel on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1)  # estimate prior
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # get kernel on testing data; the HMMs are copied so the training
    # features keep their own observations
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a())  # use prior from training data
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def features_string_file_modular(directory, fname):
    """Load RAWBYTE string features from a directory, then from a file.

    The feature object is first filled via ``load_from_directory(directory)``
    and then loaded again from ``CSVFile(fname)`` (one string per line,
    according to the original example's comment).

    Returns ``(features_as_returned_by_get_features, feature_object)``.
    """
    from modshogun import StringCharFeatures, RAWBYTE
    from modshogun import CSVFile

    string_feats = StringCharFeatures(RAWBYTE)

    # First source: every file found in `directory`.
    string_feats.load_from_directory(directory)

    # Second source: the single file `fname`.
    string_feats.load(CSVFile(fname))

    return string_feats.get_features(), string_feats
def classifier_svmlight_modular(fm_train_dna=traindat, fm_test_dna=testdat,
                                label_train_dna=label_traindat, C=1.2,
                                epsilon=1e-5, num_threads=1):
    """Train SVMLight with a weighted-degree string kernel (degree 20).

    After training, the kernel is re-initialised on (train, test) and the
    SVM is applied once; the predicted labels are computed but discarded.

    Returns the kernel, or ``None`` (after printing a notice) when
    ``SVMLight`` cannot be imported from ``modshogun``.
    """
    from modshogun import StringCharFeatures, BinaryLabels, DNA
    from modshogun import WeightedDegreeStringKernel
    try:
        from modshogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    train_feats = StringCharFeatures(DNA)
    train_feats.set_features(fm_train_dna)
    test_feats = StringCharFeatures(DNA)
    test_feats.set_features(fm_test_dna)

    wd_degree = 20
    kernel = WeightedDegreeStringKernel(train_feats, train_feats, wd_degree)

    svm = SVMLight(C, kernel, BinaryLabels(label_train_dna))
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    # Switch the kernel to the test data and run one prediction pass.
    kernel.init(train_feats, test_feats)
    svm.apply().get_labels()
    return kernel
예제 #45
0
def kernel_top_modular(fm_train_dna=traindat, fm_test_dna=testdat,
                       label_train_dna=label_traindat, pseudo=1e-1,
                       order=1, gap=0, reverse=False, kargs=[1, False, True]):
    """Compute TOP-kernel train/test matrices from two class-wise HMMs.

    One single-state HMM (``N=1``, ``M=4``) is trained with Baum-Welch
    (``BW_NORMAL``) on the training strings labelled ``+1`` and one on those
    labelled ``-1``. Both are then pointed at the complete training (resp.
    test) word features to build ``TOPFeatures``, on which a
    ``PolyKernel(*kargs)`` is evaluated.

    Args:
        fm_train_dna: training DNA strings.
        fm_test_dna: test DNA strings.
        label_train_dna: one label per training string; entries equal to 1
            select the positive HMM's data, entries equal to -1 the
            negative HMM's data.
        pseudo: passed to the ``HMM`` constructor.
        order, gap, reverse: passed to ``obtain_from_char``.
        kargs: extra positional arguments for ``PolyKernel``; only read,
            never modified.

    Returns:
        ``(km_train, km_test, kernel)``.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
    from modshogun import PolyKernel
    from modshogun import HMM, BW_NORMAL

    N = 1  # toy HMM with 1 state
    M = 4  # 4 observations -> DNA

    def _word_features(strings):
        # DNA strings -> word features of the requested order.
        charfeat = StringCharFeatures(strings, DNA)
        wordfeat = StringWordFeatures(charfeat.get_alphabet())
        wordfeat.obtain_from_char(charfeat, order - 1, order, gap, reverse)
        return wordfeat

    # Split the training strings by class using the labels that were passed
    # in. Previously the function read the free names fm_hmm_pos/fm_hmm_neg,
    # so label_train_dna was ignored.
    labelled = list(zip(fm_train_dna, label_train_dna))
    fm_hmm_pos = [seq for seq, lab in labelled if lab == 1]
    fm_hmm_neg = [seq for seq, lab in labelled if lab == -1]

    # train HMM for positive class
    hmm_pos_train = _word_features(fm_hmm_pos)
    pos = HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    hmm_neg_train = _word_features(fm_hmm_neg)
    neg = HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # word features the kernel is evaluated on
    wordfeats_train = _word_features(fm_train_dna)
    wordfeats_test = _word_features(fm_test_dna)

    # get kernel on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = TOPFeatures(10, pos, neg, False, False)
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # get kernel on testing data; the HMMs are copied so the training
    # features keep their own observations
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False)
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def classifier_svmlight_linear_term_modular(fm_train_dna=traindna,
                                            fm_test_dna=testdna,
                                            label_train_dna=label_traindna,
                                            degree=3,
                                            C=10,
                                            epsilon=1e-5,
                                            num_threads=1):
    """Train SVMLight with a custom linear term on a weighted-degree kernel.

    The SVM uses ``qpsize`` 3 and a fixed ten-element linear term (the
    negated vector 1..8,7,6). After training, the kernel is re-initialised on
    (train, test) and the predicted labels are returned.

    Returns ``(out, kernel)``. If ``SVMLight`` cannot be imported a message
    is printed and the interpreter exits with status 0.

    NOTE(review): ``numpy`` is taken from the enclosing module, and the
    linear term has exactly ten entries -- presumably one per training
    example; confirm against the data this is called with.
    """
    from modshogun import StringCharFeatures, BinaryLabels, DNA
    from modshogun import WeightedDegreeStringKernel
    try:
        from modshogun import SVMLight
    except ImportError:
        print("SVMLight is not available")
        exit(0)

    train_feats = StringCharFeatures(DNA)
    train_feats.set_features(fm_train_dna)
    test_feats = StringCharFeatures(DNA)
    test_feats.set_features(fm_test_dna)

    kernel = WeightedDegreeStringKernel(train_feats, train_feats, degree)

    svm = SVMLight(C, kernel, BinaryLabels(label_train_dna))
    svm.set_qpsize(3)
    linear_term = -numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 7, 6],
                               dtype=numpy.double)
    svm.set_linear_term(linear_term)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(train_feats, test_feats)
    predicted = svm.apply().get_labels()
    return predicted, kernel
예제 #47
0
	def get_predictions_from_seqdict(self, seqdic, site):
		"""Score every candidate position of every sequence in one batch.

		All windows ``seq[i - window_left : i + window_right + 2]`` (with
		``i = position - self.offset``) of all entries in ``seqdic`` are
		collected into one DNA string feature object, so that the kernel is
		initialised and the SVM applied only once. The resulting labels are
		then handed back to each entry, in the same order, through
		``entry.preds[site].set_scores``.

		``seqdic`` is iterated twice and must therefore be re-iterable.
		Returns nothing; the scores are stored on the entries.
		"""
		left = self.window_left
		right = self.window_right

		testdat = []
		for entry in seqdic:
			sequence = entry.seq
			for pos in entry.preds[site].positions:
				centre = pos - self.offset
				testdat.append(sequence[centre - left:centre + right + 2])

		t = StringCharFeatures(DNA)
		t.set_string_features(testdat)

		self.wd_kernel.init(self.traindat, t)
		l = self.svm.classify().get_labels()
		sys.stderr.write("\n...done...\n")

		# Walk the flat label vector in the order the windows were appended.
		k = 0
		for entry in seqdic:
			num = len(entry.preds[site].positions)
			entry.preds[site].set_scores([l[k + j] for j in range(num)])
			k += num
def distribution_histogram_modular(fm_dna=traindna,
                                   order=3,
                                   gap=0,
                                   reverse=False):
    """Train a Histogram distribution on word features of DNA strings.

    Returns ``(histo, out_sample, out_likelihood)`` where ``out_likelihood``
    comes from ``get_log_likelihood()`` and ``out_sample`` from
    ``get_log_likelihood_sample()`` -- note that the sample value comes first
    in the returned tuple.
    """
    from modshogun import StringWordFeatures, StringCharFeatures, DNA
    from modshogun import Histogram

    dna_chars = StringCharFeatures(DNA)
    dna_chars.set_features(fm_dna)
    word_feats = StringWordFeatures(dna_chars.get_alphabet())
    word_feats.obtain_from_char(dna_chars, order - 1, order, gap, reverse)

    histo = Histogram(word_feats)
    histo.train()

    # The results of the next three calls are not used further; the calls
    # are kept from the original example.
    histo.get_histogram()
    num_examples = word_feats.get_num_vectors()
    num_param = histo.get_num_model_parameters()

    out_likelihood = histo.get_log_likelihood()
    out_sample = histo.get_log_likelihood_sample()
    return histo, out_sample, out_likelihood
def distance_manhattenword_modular(train_fname=traindna, test_fname=testdna,
                                   order=3, gap=0, reverse=False):
    """Compute Manhattan word distance matrices for two DNA string files.

    Both files are read through ``CSVFile`` into DNA char features, embedded
    as word features and sorted with one shared ``SortWordString``
    preprocessor that is initialised on the training features.

    Returns ``(dm_train, dm_test)``: the (train, train) and (train, test)
    distance matrices.
    """
    from modshogun import StringCharFeatures, StringWordFeatures, DNA
    from modshogun import SortWordString, ManhattanWordDistance, CSVFile

    # Training file -> sorted word features.
    train_chars = StringCharFeatures(CSVFile(train_fname), DNA)
    train_words = StringWordFeatures(train_chars.get_alphabet())
    train_words.obtain_from_char(train_chars, order - 1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preprocessor(sorter)
    train_words.apply_preprocessor()

    # Test file, reusing the sorter initialised on the training features.
    test_chars = StringCharFeatures(CSVFile(test_fname), DNA)
    test_words = StringWordFeatures(test_chars.get_alphabet())
    test_words.obtain_from_char(test_chars, order - 1, order, gap, reverse)
    test_words.add_preprocessor(sorter)
    test_words.apply_preprocessor()

    distance = ManhattanWordDistance(train_words, train_words)
    dm_train = distance.get_distance_matrix()

    distance.init(train_words, test_words)
    dm_test = distance.get_distance_matrix()

    return dm_train, dm_test
예제 #50
0
def get_kernel_mat(fm_train_dna, fm_test_dna, N, M,
                   pseudo=1e-1, order=1, gap=0, reverse=False):
    """Compute linear Fisher-kernel matrices from one HMM and its copy.

    An HMM is trained (Baum-Welch, ``BW_NORMAL``) on the word features of
    ``fm_train_dna``; the "negative" model is a copy of that trained HMM.
    ``FKFeatures`` built from the pair feed a ``LinearKernel``. Progress
    messages and a few diagnostic values are printed along the way.

    Returns ``(km_train, km_test, kernel)``.
    """
    # train HMM for positive class
    print("hmm training")
    train_chars = StringCharFeatures(fm_train_dna, DNA)
    hmm_train = StringWordFeatures(train_chars.get_alphabet())
    hmm_train.obtain_from_char(train_chars, order - 1, order, gap, reverse)
    pos = HMM(hmm_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)
    neg = HMM(pos)  # copy of the model that was just trained

    print("Kernel training data")
    train_chars = StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train = StringWordFeatures(train_chars.get_alphabet())
    wordfeats_train.obtain_from_char(train_chars, order - 1, order, gap,
                                     reverse)

    print("Kernel testing data")
    test_chars = StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test = StringWordFeatures(test_chars.get_alphabet())
    wordfeats_test.obtain_from_char(test_chars, order - 1, order, gap,
                                    reverse)

    print("get kernel on training data")
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1)  # estimate prior

    print('getting feature matrix')
    first_vec = feats_train.get_feature_vector(0)
    second_vec = feats_train.get_feature_vector(1)
    print(np.dot(first_vec, second_vec))
    kernel = LinearKernel(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()
    # Single formatted argument: same output as the two-item print statement.
    print("%s %s" % (km_train.shape, km_train[0, 1]))

    print("get kernel on testing data")
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a())  # use prior from training data
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def classifier_svmlight_batch_linadd_modular(
    fm_train_dna, fm_test_dna, label_train_dna, degree, C, epsilon, num_threads
):
    """Train SVMLight on a weighted-degree kernel and predict in batch mode.

    After training, the kernel is re-initialised on (train, test). The SVM
    is applied once with batch computation and linadd disabled (result
    discarded) and once with batch computation enabled; the labels of the
    second run are returned.

    Args:
        fm_train_dna: training DNA strings.
        fm_test_dna: test DNA strings.
        label_train_dna: training labels for ``BinaryLabels``.
        degree: degree of the ``WeightedDegreeStringKernel``. It is now
            honoured; it used to be overwritten with 20 inside the function.
        C, epsilon, num_threads: SVMLight settings.

    Returns:
        ``(labels, svm)``, or ``None`` (after printing a notice) when
        ``SVMLight`` cannot be imported from ``modshogun``.
    """
    from modshogun import StringCharFeatures, BinaryLabels, DNA
    from modshogun import WeightedDegreeStringKernel, MSG_DEBUG

    try:
        from modshogun import SVMLight
    except ImportError:
        print("No support for SVMLight available.")
        return

    feats_train = StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = BinaryLabels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)

    # First pass: plain evaluation, result intentionally unused.
    svm.set_batch_computation_enabled(False)
    svm.set_linadd_enabled(False)
    svm.apply().get_labels()

    # Second pass: batch computation switched on; these labels are returned.
    svm.set_batch_computation_enabled(True)
    labels = svm.apply().get_labels()
    return labels, svm
def features_string_char_compressed_modular(fname):
    """Round-trip string features through several compression formats.

    The strings loaded from ``fname`` are saved under ``tmp/`` in
    uncompressed, LZO, GZIP, BZIP2 and LZMA form and each file is loaded back
    with decompression on load. The LZO file is additionally loaded in
    compressed form twice: once decompressed through an applied
    ``DecompressCharString`` preprocessor and once with on-the-fly
    preprocessing enabled.

    The temporary files are removed on every exit path, including when one
    of the steps raises. Returns nothing.

    NOTE(review): the ``tmp/`` directory is assumed to exist already.
    """
    import os

    from modshogun import StringCharFeatures, StringFileCharFeatures, RAWBYTE
    from modshogun import UNCOMPRESSED, SNAPPY, LZO, GZIP, BZIP2, LZMA, MSG_DEBUG
    from modshogun import DecompressCharString

    lzo_path = "tmp/foo_lzo.str"
    # (file, compression type, level) in the order they are exercised.
    # snappy stays disabled, as in the original ("not stable yet?!").
    round_trips = (
        ("tmp/foo_uncompressed.str", UNCOMPRESSED, 1),
        (lzo_path, LZO, 9),
        ("tmp/foo_gzip.str", GZIP, 9),
        ("tmp/foo_bzip2.str", BZIP2, 9),
        ("tmp/foo_lzma.str", LZMA, 9),
    )
    # The snappy file is never written here but is still cleaned up in case
    # an earlier run left one behind.
    tmp_files = [path for path, _, _ in round_trips] + ["tmp/foo_snappy.str"]

    f = StringFileCharFeatures(fname, RAWBYTE)

    try:
        # save compressed, then load and uncompress on load
        for path, compression, level in round_trips:
            f.save_compressed(path, compression, level)
            f2 = StringCharFeatures(RAWBYTE)
            f2.load_compressed(path, True)

        # load compressed data and uncompress via preprocessor
        f2 = StringCharFeatures(RAWBYTE)
        f2.load_compressed(lzo_path, False)
        f2.add_preprocessor(DecompressCharString(LZO))
        f2.apply_preprocessor()

        # load compressed data and uncompress on-the-fly via preprocessor
        f2 = StringCharFeatures(RAWBYTE)
        f2.load_compressed(lzo_path, False)
        f2.add_preprocessor(DecompressCharString(LZO))
        f2.enable_on_the_fly_preprocessing()
    finally:
        # clean up, even if one of the steps above failed
        for path in tmp_files:
            if os.path.exists(path):
                os.unlink(path)
def features_string_sliding_window_modular(strings):
    """Demonstrate window extraction on a single DNA string.

    ``strings`` (one DNA string) is wrapped as a one-element feature set.
    Windows of length 5 and then 4 are obtained with step 1, after which the
    features are reset to the same input string and windows of size 4 and
    then 8 are extracted at the fixed positions 0, 6, 16 and 25.

    Returns the ``StringCharFeatures`` object in its final state.

    NOTE(review): the position list presumably requires the input to be at
    least 33 characters long (position 25 plus window 8) -- confirm.
    """
    from modshogun import StringCharFeatures, DNA
    from modshogun import DynamicIntArray

    f = StringCharFeatures([strings], DNA)

    # slide a window of length 5 over features
    # (memory efficient, does not copy strings)
    f.obtain_by_sliding_window(5, 1)

    # slide a window of length 4 over features
    # (memory efficient, does not copy strings)
    f.obtain_by_sliding_window(4, 1)

    # Reset to the caller's string. This used to reference the free name `s`
    # instead of the `strings` parameter.
    f.set_features([strings])

    # extract string-windows at position 0,6,16,25 of window size 4
    # (memory efficient, does not copy strings)
    positions = DynamicIntArray()
    for start in (0, 6, 16, 25):
        positions.append_element(start)

    f.obtain_by_position_list(4, positions)

    # now extract windows of size 8 from the same position list
    f.obtain_by_position_list(8, positions)
    return f
def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse):
    """Repeatedly build a CommWordStringKernel on synthetic DNA strings.

    Two identical lists of 141 strings each are generated (60 x ``num*'ACGT'``,
    21 x ``num*'TTGT'``, 60 x ``num*'ACGT'``). Ten times over, the
    concatenated lists are turned into sorted word features and a
    ``CommWordStringKernel`` with an ``IdentityKernelNormalizer`` is
    initialised on them.

    Returns the kernel matrix of the final iteration.
    """
    import gc
    from modshogun import Alphabet, StringCharFeatures, StringWordFeatures, DNA
    from modshogun import SortWordString, MSG_DEBUG
    from modshogun import CommWordStringKernel, IdentityKernelNormalizer
    from numpy import mat

    acgt = num * 'ACGT'
    ttgt = num * 'TTGT'

    # Same contents as the original hand-written literals.
    POS = 60 * [acgt] + 21 * [ttgt] + 60 * [acgt]
    NEG = 60 * [acgt] + 21 * [ttgt] + 60 * [acgt]

    for _ in range(10):
        alphabet = Alphabet(DNA)
        char_feats = StringCharFeatures(alphabet)
        char_feats.set_features(POS + NEG)

        word_feats = StringWordFeatures(char_feats.get_alphabet())
        word_feats.obtain_from_char(char_feats, order - 1, order, gap, reverse)

        sorter = SortWordString()
        sorter.init(word_feats)
        word_feats.add_preprocessor(sorter)
        word_feats.apply_preprocessor()

        spec = CommWordStringKernel(10, False)
        spec.set_normalizer(IdentityKernelNormalizer())
        spec.init(word_feats, word_feats)
        K = spec.get_kernel_matrix()

    # Drop the local references before returning, as the original did.
    del POS, NEG, order, gap, reverse
    return K