def arffread(kernelname, datafilename):
    """Decide based on kernelname whether to read a sequence or vectorial file.

    The kernel name selects the dataset kind handed to ``init_datasetfile``:

    * 'gauss', 'linear', 'poly' or ``None``                   -> 'vec'
    * 'wd', 'localalign', 'localimprove', 'spec', 'cumspec'   -> 'seq'
    * 'spec2', 'cumspec2'                                     -> 'mseq'

    Returns whatever ``readlines()`` of the opened dataset file returns.

    Raises ValueError for any other kernel name.  Previously this case only
    printed a message and then failed with an UnboundLocalError, because the
    file object was never assigned.
    """
    if kernelname is None or kernelname in ('gauss', 'linear', 'poly'):
        filekind = 'vec'
    elif kernelname in ('wd', 'localalign', 'localimprove', 'spec', 'cumspec'):
        filekind = 'seq'
    elif kernelname in ('spec2', 'cumspec2'):
        filekind = 'mseq'
    else:
        # Fail before touching the data file so the real cause is reported.
        raise ValueError('Unknown kernel in arffread: %r' % (kernelname,))

    fp = init_datasetfile(datafilename, filekind)
    return fp.readlines()
def arffread(kernelname, datafilename):
    """Decide based on kernelname whether to read a sequence or vectorial file.

    The kernel name selects the dataset kind handed to ``init_datasetfile``:

    * 'gauss', 'linear', 'poly' or ``None``                   -> 'vec'
    * 'wd', 'localalign', 'localimprove', 'spec', 'cumspec'   -> 'seq'
    * 'spec2', 'cumspec2'                                     -> 'mseq'

    Returns whatever ``readlines()`` of the opened dataset file returns.

    Raises ValueError for any other kernel name.  Previously this case only
    printed a message and then failed with an UnboundLocalError, because the
    file object was never assigned.
    """
    vectorial_kernels = ('gauss', 'linear', 'poly')
    sequence_kernels = ('wd', 'localalign', 'localimprove', 'spec', 'cumspec')
    multi_sequence_kernels = ('spec2', 'cumspec2')

    if kernelname is None or kernelname in vectorial_kernels:
        filekind = 'vec'
    elif kernelname in sequence_kernels:
        filekind = 'seq'
    elif kernelname in multi_sequence_kernels:
        filekind = 'mseq'
    else:
        # Fail before touching the data file so the real cause is reported.
        raise ValueError('Unknown kernel in arffread: %r' % (kernelname,))

    return init_datasetfile(datafilename, filekind).readlines()
def fastaread(fnamepos, fnameneg=None):
    """Read one or two 'seq' dataset files and build a label vector.

    Examples read from ``fnamepos`` get the label ``ones(...)``.  If
    ``fnameneg`` is given, its examples are appended after the positive ones
    with label ``-ones(...)`` and the two counts are printed.  The labels
    stored in the files themselves are ignored.

    Returns the tuple ``(all_examples, all_labels)``.
    """
    positives, _ = init_datasetfile(fnamepos, 'seq').readlines()

    # Only a positive file: every example is labelled with ones().
    if fnameneg is None:
        return positives, ones(len(positives))

    negatives, _ = init_datasetfile(fnameneg, 'seq').readlines()
    print('positive: %d, negative %d' % (len(positives), len(negatives)))
    all_labels = concatenate((ones(len(positives)), -ones(len(negatives))))
    all_examples = positives + negatives
    return all_examples, all_labels
def fastaread(fnamepos, fnameneg=None):
    """Read one or two 'seq' dataset files and build a label vector.

    Examples read from ``fnamepos`` get the label ``ones(...)``.  If
    ``fnameneg`` is given, its examples are appended after the positive ones
    with label ``-ones(...)`` and the two counts are printed.  The labels
    stored in the files themselves are ignored.

    Returns the tuple ``(all_examples, all_labels)``.
    """
    positives, _ = init_datasetfile(fnamepos, 'seq').readlines()

    # Only a positive file: every example is labelled with ones().
    if fnameneg is None:
        return positives, ones(len(positives))

    negatives, _ = init_datasetfile(fnameneg, 'seq').readlines()
    print('positive: %d, negative %d' % (len(positives), len(negatives)))
    all_labels = concatenate((ones(len(positives)), -ones(len(negatives))))
    all_examples = positives + negatives
    return all_examples, all_labels
def fastawrite_sequence(filename, p):
    """Write a FASTA file containing a sequence dataset.

    ``p`` supplies the generator settings (motif, numseq, seqlenmin,
    seqlenmax, posstart, posend, mutrate) passed to ``motifgen``.  Every
    generated sequence is written with the label ``ones(...)``; the metadata
    returned by ``motifgen`` is not used.
    """
    # Kept from the original; the name `arff` is not referenced below.
    import arff

    _unused_metadata, sequences = motifgen(
        p.motif, p.numseq, p.seqlenmin, p.seqlenmax,
        p.posstart, p.posend, p.mutrate)
    sequence_labels = ones(len(sequences))

    outfile = init_datasetfile(filename, 'seq')
    outfile.writelines(sequences, sequence_labels)
def arffwrite_real(filename, numpoint, numfeat, fracpos=0.5, width=1.0):
    """Write an ARFF file containing a vectorial dataset.

    The data comes from ``cloudgen(numpoint, numfeat, fracpos, width)``,
    which returns metadata, the point cloud and its labels.  The metadata
    becomes the file comment and the dataset is named 'pointcloud'.
    """
    generated = cloudgen(numpoint, numfeat, fracpos, width)
    (cloud_metadata, points, point_labels) = generated

    outfile = init_datasetfile(filename, 'vec')
    outfile.comment = cloud_metadata
    outfile.dataname = 'pointcloud'
    outfile.writelines(points, point_labels)
def arffwrite_sequence(filename, p, n):
    """Write an ARFF file containing a sequence dataset.

    ``p`` and ``n`` hold the ``motifgen`` settings for the positive and the
    negative class.  Positive sequences are written first with label
    ``ones(...)``, followed by the negative ones with ``-ones(...)``.  The
    file comment is both metadata strings joined by a space, and the dataset
    is named 'motif'.
    """
    def _generate(spec):
        # One motifgen call per class, reading the same attributes from spec.
        return motifgen(spec.motif, spec.numseq, spec.seqlenmin,
                        spec.seqlenmax, spec.posstart, spec.posend,
                        spec.mutrate)

    pos_meta, pos_seqs = _generate(p)
    neg_meta, neg_seqs = _generate(n)

    all_labels = concatenate((ones(len(pos_seqs)), -ones(len(neg_seqs))))
    all_seqs = pos_seqs + neg_seqs

    outfile = init_datasetfile(filename, 'seq')
    outfile.comment = pos_meta + ' ' + neg_meta
    outfile.dataname = 'motif'
    outfile.writelines(all_seqs, all_labels)
def test_gc(gcfilename):
    """Check the gc content files for conflicting labels.

    Reads a 'vec' dataset, prints how many labels are above and below zero,
    then counts every pair of examples (ix < iy) whose label product is
    negative while their ``sqr_dist`` entry is below 0.01.  The shapes of
    both matrices and the final count are printed; nothing is returned.
    """
    examples, labels = init_datasetfile(gcfilename, 'vec').readlines()
    print('%d positive and %d negative examples'
          % (sum(labels > 0.0), sum(labels < 0.0)))

    # Pairwise distances between examples, and pairwise label products
    # (negative where two examples carry opposite signs).
    distance = sqr_dist(numpy.matrix(examples), numpy.matrix(examples))
    labdist = numpy.matrix(labels).T * numpy.matrix(labels)

    num_examples = len(labels)
    contracount = 0
    for first in xrange(num_examples):
        # Only look at the upper triangle so each pair is counted once.
        for second in xrange(first + 1, num_examples):
            opposing = labdist[first, second] < 0
            if opposing and distance[first, second] < 0.01:
                contracount += 1

    print('%s %s' % (distance.shape, labdist.shape))
    print('%d identical examples with opposing labels' % contracount)
def test_gc(gcfilename):
    """Check the gc content files for conflicting labels.

    Reads a 'vec' dataset, prints how many labels are above and below zero,
    then counts every pair of examples (ix < iy) whose label product is
    negative while their ``sqr_dist`` entry is below 0.01.  The shapes of
    both matrices and the final count are printed; nothing is returned.
    """
    examples, labels = init_datasetfile(gcfilename, 'vec').readlines()
    print('%d positive and %d negative examples'
          % (sum(labels > 0.0), sum(labels < 0.0)))

    # Pairwise distances between examples, and pairwise label products
    # (negative where two examples carry opposite signs).
    distance = sqr_dist(numpy.matrix(examples), numpy.matrix(examples))
    labdist = numpy.matrix(labels).T * numpy.matrix(labels)

    num_examples = len(labels)
    contracount = 0
    for first in xrange(num_examples):
        # Only look at the upper triangle so each pair is counted once.
        for second in xrange(first + 1, num_examples):
            opposing = labdist[first, second] < 0
            if opposing and distance[first, second] < 0.01:
                contracount += 1

    print('%s %s' % (distance.shape, labdist.shape))
    print('%d identical examples with opposing labels' % contracount)
(n.seqlenmin, n.seqlenmax) = esvm.parse.parse_range(sys.argv[10]) (n.posstart, n.posend) = esvm.parse.parse_range(sys.argv[11]) n.mutrate = float(sys.argv[12]) filename = sys.argv[13] arffwrite_sequence(filename, p, n) elif sys.argv[1] == 'cloud': # generate a data cloud in ARFF format numpoint = int(sys.argv[2]) numfeat = int(sys.argv[3]) fracpos = float(sys.argv[4]) width = float(sys.argv[5]) filename = sys.argv[6] arffwrite_real(filename, numpoint, numfeat, fracpos, width) if len(sys.argv) >= 8: fp = init_datasetfile(filename, 'vec') (examples, labels) = fp.readlines() pointcloud = [] for ix in xrange(numpoint): pointcloud.append( array([labels[ix], examples[0, ix], examples[1, ix]])) esvm.plots.plotcloud(pointcloud, sys.argv[7], 'Pointcloud') #(examples,labels,metadata)=arffwrite_real(filename, numpoint, numfeat, fracpos, width) #if len(sys.argv)>=8: # plots.plotcloud(pointcloud,sys.argv[7],metadata) else: print 'Unknown option %s\n' % sys.argv[1]
def splice_example(Cs, gcfilename, seqfilename, seq2filename, plot=False):
    """For the data files, apply the set of kernels.

    Reads a 'vec' dataset from ``gcfilename`` (passed through
    ``normalize(..., subtract_mean=True)``), an 'mseq' dataset from
    ``seq2filename`` and a 'seq' dataset from ``seqfilename``, then calls
    ``run_single_experiment`` for each of the 15 experiments defined below.

    Cs   -- ``None`` to run every experiment once for each C in
            (0.01, 0.1, 1, 2, 5, 10); otherwise a sequence indexed by
            experiment position, giving the C used for that experiment.
    plot -- if true, show a pylab scatter plot of ``gc_examples[0,]``
            against ``gc_examples[1,]`` before the other files are read.

    Returns the ``results`` list handed to every ``run_single_experiment``
    call.
    """
    # hyperparameters
    num_fold_cv = 5
    default_Cs = (0.01, 0.1, 1, 2, 5, 10)

    # Filled in by run_single_experiment (the original comment refers to the
    # area under the receiver operating characteristic).
    results = []

    # --- Read datasets ---------------------------------------------------
    # GC features
    gc_examples, gc_labels = init_datasetfile(gcfilename, 'vec').readlines()
    gc_examples = normalize(gc_examples, subtract_mean=True)

    if plot:
        from pylab import scatter, show
        color = ['b', 'r']
        # (int(i) + 1) / 2 picks color[0] for -1 and color[1] for +1; this
        # relies on Python 2 integer division (assumes labels are +/-1).
        point_colors = ''.join([color[(int(i) + 1) / 2] for i in gc_labels])
        scatter(gc_examples[0, ], gc_examples[1, ],
                s=400 * (gc_labels + 2), c=point_colors, alpha=0.1)
        show()

    # 2 sequence features
    dna2_examples, dna2_labels = init_datasetfile(seq2filename, 'mseq').readlines()
    # DNA sequences
    dna_examples, dna_labels = init_datasetfile(seqfilename, 'seq').readlines()

    gc_data = (gc_examples, gc_labels)
    dna2_data = (dna2_examples, dna2_labels)
    dna_data = (dna_examples, dna_labels)

    # --- Define experiments: (kernel name, kernel parameters, data) -------
    planned = []
    # Linear kernel on GC content
    planned.append(('linear', {'scale': 1.0, 'name': 'scale'}, gc_data))
    # Polynomial kernel on GC content
    for degree in (3, 5):
        planned.append(('poly',
                        {'degree': degree, 'name': 'degree',
                         'inhomogene': True, 'normal': True},
                        gc_data))
    # Gaussian kernel on GC content
    for kernel_width in (100.0, 1.0, 0.01):
        planned.append(('gauss',
                        {'width': kernel_width, 'name': 'width'},
                        gc_data))
    # Spectrum and cumulative spectrum kernels on 2 dna sequences
    for kernel in ('spec2', 'cumspec2'):
        for degree in (1, 3, 5):
            planned.append((kernel,
                            {'degree': degree, 'name': 'degree'},
                            dna2_data))
    # Weighted degree kernel on dna sequences
    for degree in (1, 3, 5):
        planned.append(('wd',
                        {'degree': degree, 'shift': 0, 'name': 'degree'},
                        dna_data))
    experiments = tuple(planned)

    # --- Run ----------------------------------------------------------------
    if Cs is None:
        for C in default_Cs:
            for kernel, params, (examples, labels) in experiments:
                run_single_experiment(results, num_fold_cv, kernel, params,
                                      C, examples, labels)
    else:
        for position, (kernel, params, (examples, labels)) in enumerate(experiments):
            run_single_experiment(results, num_fold_cv, kernel, params,
                                  Cs[position], examples, labels)

    return results
def splice_example(Cs, gcfilename, seqfilename, seq2filename, plot=False):
    """For the data files, apply the set of kernels.

    Reads a 'vec' dataset from ``gcfilename`` (passed through
    ``normalize(..., subtract_mean=True)``), an 'mseq' dataset from
    ``seq2filename`` and a 'seq' dataset from ``seqfilename``, then calls
    ``run_single_experiment`` for each of the 15 experiments defined below.

    Cs   -- ``None`` to run every experiment once for each C in
            (0.01, 0.1, 1, 2, 5, 10); otherwise a sequence indexed by
            experiment position, giving the C used for that experiment.
    plot -- if true, show a pylab scatter plot of ``gc_examples[0,]``
            against ``gc_examples[1,]`` before the other files are read.

    Returns the ``results`` list handed to every ``run_single_experiment``
    call.
    """
    # hyperparameters
    num_fold_cv = 5
    default_Cs = (0.01, 0.1, 1, 2, 5, 10)

    # Filled in by run_single_experiment (the original comment refers to the
    # area under the receiver operating characteristic).
    results = []

    # --- Read datasets ---------------------------------------------------
    # GC features
    gc_examples, gc_labels = init_datasetfile(gcfilename, 'vec').readlines()
    gc_examples = normalize(gc_examples, subtract_mean=True)

    if plot:
        from pylab import scatter, show
        color = ['b', 'r']
        # (int(i) + 1) / 2 picks color[0] for -1 and color[1] for +1; this
        # relies on Python 2 integer division (assumes labels are +/-1).
        point_colors = ''.join([color[(int(i) + 1) / 2] for i in gc_labels])
        scatter(gc_examples[0, ], gc_examples[1, ],
                s=400 * (gc_labels + 2), c=point_colors, alpha=0.1)
        show()

    # 2 sequence features
    dna2_examples, dna2_labels = init_datasetfile(seq2filename, 'mseq').readlines()
    # DNA sequences
    dna_examples, dna_labels = init_datasetfile(seqfilename, 'seq').readlines()

    gc_data = (gc_examples, gc_labels)
    dna2_data = (dna2_examples, dna2_labels)
    dna_data = (dna_examples, dna_labels)

    # --- Define experiments: (kernel name, kernel parameters, data) -------
    planned = []
    # Linear kernel on GC content
    planned.append(('linear', {'scale': 1.0, 'name': 'scale'}, gc_data))
    # Polynomial kernel on GC content
    for degree in (3, 5):
        planned.append(('poly',
                        {'degree': degree, 'name': 'degree',
                         'inhomogene': True, 'normal': True},
                        gc_data))
    # Gaussian kernel on GC content
    for kernel_width in (100.0, 1.0, 0.01):
        planned.append(('gauss',
                        {'width': kernel_width, 'name': 'width'},
                        gc_data))
    # Spectrum and cumulative spectrum kernels on 2 dna sequences
    for kernel in ('spec2', 'cumspec2'):
        for degree in (1, 3, 5):
            planned.append((kernel,
                            {'degree': degree, 'name': 'degree'},
                            dna2_data))
    # Weighted degree kernel on dna sequences
    for degree in (1, 3, 5):
        planned.append(('wd',
                        {'degree': degree, 'shift': 0, 'name': 'degree'},
                        dna_data))
    experiments = tuple(planned)

    # --- Run ----------------------------------------------------------------
    if Cs is None:
        for C in default_Cs:
            for kernel, params, (examples, labels) in experiments:
                run_single_experiment(results, num_fold_cv, kernel, params,
                                      C, examples, labels)
    else:
        for position, (kernel, params, (examples, labels)) in enumerate(experiments):
            run_single_experiment(results, num_fold_cv, kernel, params,
                                  Cs[position], examples, labels)

    return results
n.numseq = int(sys.argv[9]) (n.seqlenmin,n.seqlenmax) = esvm.parse.parse_range(sys.argv[10]) (n.posstart,n.posend) = esvm.parse.parse_range(sys.argv[11]) n.mutrate = float(sys.argv[12]) filename = sys.argv[13] arffwrite_sequence(filename, p, n) elif sys.argv[1] == 'cloud': # generate a data cloud in ARFF format numpoint = int(sys.argv[2]) numfeat = int(sys.argv[3]) fracpos = float(sys.argv[4]) width = float(sys.argv[5]) filename = sys.argv[6] arffwrite_real(filename, numpoint, numfeat, fracpos, width) if len(sys.argv)>=8: fp = init_datasetfile(filename,'vec') (examples,labels) = fp.readlines() pointcloud = [] for ix in xrange(numpoint): pointcloud.append(array([labels[ix],examples[0,ix],examples[1,ix]])) esvm.plots.plotcloud(pointcloud,sys.argv[7],'Pointcloud') #(examples,labels,metadata)=arffwrite_real(filename, numpoint, numfeat, fracpos, width) #if len(sys.argv)>=8: # plots.plotcloud(pointcloud,sys.argv[7],metadata) else: print 'Unknown option %s\n' % sys.argv[1]