def __init__(self, lexicon, C=1, num_features=100): self.training_set = None self.classes = None self.test_set = None self.results = None self.kernel = ker.Linear() self.C = C self.feature_data = PATH + "/learning/stored/feature.data" self.label_data = PATH + "/learning/stored/svm_label.data" self.lexicon = lexicon self.num_features = len(self.lexicon.words.keys()) try: print "Loading existing SVM..." features = pickle.load(open(self.feature_data)) labels = pickle.load(open(self.label_data)) sparsedata = SparseDataSet(features, L=labels) self.svm_classifier = loadSVM(PATH + "/learning/stored/svm.classifier",sparsedata) except Exception as e: print e print "Existing SVM not found!" self.svm_classifier = svm.SVM(self.kernel) self.accuracy = None self.predicted_labels = None score = featsel.FeatureScore('golub') self.filter = featsel.Filter(score) self.feature_selector = FeatureSelect(self.svm_classifier, self.filter) self.chain = Chain([self.feature_selector, self.svm_classifier])
def __init__(self, lexicon, C=1, num_features=100): self.training_set = None self.classes = None self.test_set = None self.results = None self.kernel = ker.Linear() self.C = C self.feature_data = PATH + "/learning/stored/feature.data" self.label_data = PATH + "/learning/stored/svm_label.data" self.lexicon = lexicon self.num_features = len(self.lexicon.words.keys()) try: print "Loading existing SVM..." features = pickle.load(open(self.feature_data)) labels = pickle.load(open(self.label_data)) sparsedata = SparseDataSet(features, L=labels) self.svm_classifier = loadSVM(PATH + "/learning/stored/svm.classifier", sparsedata) except Exception as e: print e print "Existing SVM not found!" self.svm_classifier = svm.SVM(self.kernel) self.accuracy = None self.predicted_labels = None score = featsel.FeatureScore("golub") self.filter = featsel.Filter(score) self.feature_selector = FeatureSelect(self.svm_classifier, self.filter) self.chain = Chain([self.feature_selector, self.svm_classifier])
def cross_validate(X,y): # Ask what percentage of the data should be trained p = 0 while p < 50 or p > 100: p = raw_input("Enter percentage of data to train (between 50 and 75): ") p = float(p); K = int(floor(float(p)*len(X)/100)) X1 = X[0:K] y1 = y[0:K] #Load data into PyML's vector objects, then train set data = VectorDataSet(X1,L=y1) s = SVM() s.train(data) s.save("cross_validating") #Now check the other data X2 = X[K+1:-1] y2 = y[K+1:-1] #Load our training data loadedSVM = loadSVM("cross_validating",data) testData = VectorDataSet(X2,L=y2) r = loadedSVM.test(testData) print r #Delete the data now that we're done with it os.system("rm cross_validating")
def on_load_clicked(self, widget):
    """Load a saved SVM classifier chosen by the user and enable the UI.

    PyML's ``loadSVM`` requires the original training data alongside the
    saved model, so the user is asked for both files. Nothing happens if
    either dialog is cancelled.
    """
    # Guard clauses replace the old nested ifs; the dead shelve/pickle
    # experiments that were commented out here have been removed.
    filen = self.getFilenameToRead("Load Classifier",filter='svm')
    if filen is None:
        return
    datfn = self.getFilenameToRead("Open Training Data",filter='mat')
    if datfn is None:
        return
    data = ml.VectorDataSet(datfn,labelsColumn=0)
    # NOTE(review): loadSVM appears to need the training data to rebuild
    # the model -- confirm against the PyML version in use.
    self.clssfr = loadSVM(filen,data)
    # classifier has been loaded. need to update button status
    self.setDisabledBtns()
    self.showMessage("The classifier has been loaded!")
def on_load_clicked(self, widget):
    """Load a saved SVM classifier chosen by the user and enable the UI.

    PyML's ``loadSVM`` requires the original training data alongside the
    saved model, so the user is asked for both files. Nothing happens if
    either dialog is cancelled.
    """
    # Guard clauses replace the old nested ifs; the dead shelve/pickle
    # experiments that were commented out here have been removed.
    filen = self.getFilenameToRead("Load Classifier", filter='svm')
    if filen is None:
        return
    datfn = self.getFilenameToRead("Open Training Data", filter='mat')
    if datfn is None:
        return
    data = ml.VectorDataSet(datfn, labelsColumn=0)
    # NOTE(review): loadSVM appears to need the training data to rebuild
    # the model -- confirm against the PyML version in use.
    self.clssfr = loadSVM(filen, data)
    # classifier has been loaded. need to update button status
    self.setDisabledBtns()
    self.showMessage("The classifier has been loaded!")
def test(component="svm", **args):
    """Exercise the PyML components named by *component*.

    Each section trains/cross-validates classifiers on the module-level
    data files and accumulates result objects per component name.

    :param component: component to exercise, or "all" for every section
    :param args: optional ``container`` key naming the dataset class to use
    :return: dict mapping component name -> list of PyML result objects
    """
    container = "SparseDataSet"
    if "container" in args:
        container = args["container"]
    try:
        # Resolve the dataset class by name from the vectorDatasets module.
        DataSet = getattr(vectorDatasets, container)
    except:
        raise ValueError, "wrong container " + container
    results = {}

    # -- general: smoke-test train/test, CV, kernels, save/load round-trip --
    comp = "general"
    if component == "all" or component == comp:
        s = svm.SVM()
        results = {}
        d = DataSet(heartdatafile, labelsColumn=0)
        s.train(d)
        s.test(d)
        s = svm.SVM()
        s.stratifiedCV(d)
        print "starting aggregate****************"
        d2 = Aggregate([d, d])
        print "end aggregate"
        r = s.stratifiedCV(d2)
        d.attachKernel("polynomial")
        s.cv(d)
        d.attachKernel("linear")
        s = svm.SVM()
        s.train(d)
        # saveSpace=False keeps what's needed to persist the model.
        s.train(d, saveSpace=False)
        s.save("tmp")
        loaded = svm.loadSVM("tmp", datasetClass=DataSet)
        r = loaded.test(d)
        d.attachKernel("gaussian", gamma=0.01)
        s.train(d, saveSpace=False)
        s.save("tmp")
        loaded = svm.loadSVM("tmp", datasetClass=DataSet, labelsColumn=1)
        r = loaded.test(d)
        os.remove("tmp")
        # Constructor smoke tests on array / list / sequence inputs.
        d = DataSet(numpy.random.randn(100, 10))
        d = DataSet([[1, 2], [2, 3]])
        d = SequenceData(["asa", "ben", "hur"])

    # -- svm: CV under polynomial and linear kernels --
    comp = "svm"
    if component == "all" or component == comp:
        d = DataSet(heartdatafile, labelsColumn=0)
        results[comp] = []
        d.attachKernel("polynomial")
        s = svm.SVM()
        results[comp].append(s.cv(d, saveSpace=True))
        d.attachKernel("linear")
        results[comp].append(s.cv(d))

    # -- kernelData: CV on a precomputed kernel matrix --
    comp = "kernelData"
    if component == "all" or component == comp:
        d = DataSet(heartdatafile, labelsColumn=0)
        results[comp] = []
        kdata = KernelData("heart.kernel", gistFormat=True)
        kdata.attachLabels(d.labels)
        s = svm.SVM()
        results[comp].append(s.cv(kdata))
        kdata.attachKernel("gaussian", gamma=0.1)
        results[comp].append(s.cv(kdata))

    # -- normalization: kernel normalization option --
    comp = "normalization"
    if component == "all" or component == comp:
        results[comp] = []
        data = DataSet(heartdatafile, labelsColumn=0)
        data.attachKernel("polynomial", degree=4, normalization="dices")
        s = svm.SVM()
        results[comp].append(s.cv(data))

    # -- svr: support vector regression on numeric labels --
    comp = "svr"
    if component == "all" or component == comp:
        d = DataSet(heartdatafile, labelsColumn=0, numericLabels=True)
        results[comp] = []
        s = svm.SVR()
        # results[comp].append(
        #     s.cv(d, saveSpace = True))
        # results[comp].append(
        #     s.trainTest(d, range(150), range(151, 250)))
        results[comp].append(s.cv(d))

    # -- save: persist and reload result objects --
    comp = "save"
    if component == "all" or component == comp:
        results[comp] = []
        s = svm.SVM()
        data = DataSet(heartdatafile, labelsColumn=0)
        import tempfile
        tmpfile = tempfile.mktemp()
        r = s.cv(data)
        r.save(tmpfile)
        r = assess.loadResults(tmpfile)
        results["save"].append(r)
        r = s.nCV(data)
        r.save(tmpfile)
        results["save"].append(assess.loadResults(tmpfile))
        r = {}
        for i in range(10):
            r[i] = s.cv(data)
        assess.saveResultObjects(r, tmpfile)
        r = assess.loadResults(tmpfile)

    # -- classifiers: non-SVM classifiers (KNN, ridge regression) --
    comp = "classifiers"
    if component == "all" or component == comp:
        d = DataSet(heartdatafile, labelsColumn=0)
        results[comp] = []
        cl = knn.KNN()
        results[comp].append(cl.stratifiedCV(d))
        print "testing ridge regression"
        ridge = ridgeRegression.RidgeRegression()
        results[comp].append(ridge.cv(d))

    # -- platt: probabilistic output wrapper --
    comp = "platt"
    if component == "all" or component == "platt":
        results[comp] = []
        d = DataSet(heartdatafile, labelsColumn=0)
        p = platt.Platt2(svm.SVM())
        results[comp].append(p.stratifiedCV(d))

    # -- multi: multi-class strategies on the iris data --
    comp = "multi"
    if component == "all" or component == comp:
        results[comp] = []
        d = DataSet(irisdatafile, labelsColumn=-1)
        mc = multi.OneAgainstOne(svm.SVM())
        results[comp].append(mc.cv(d))
        d = DataSet(irisdatafile, labelsColumn=-1)
        mc = multi.OneAgainstRest(svm.SVM())
        results[comp].append(mc.cv(d))
        mc = multi.OneAgainstRest(svm.SVM())
        d.attachKernel("poly")
        results[comp].append(mc.cv(d))
        d.attachKernel("linear")
        mc = multi.OneAgainstRest(svm.SVM())
        # kdata = datafunc.KernelData('iris.linear.kernel',
        #     labelsFile = 'irisY.csv', labelsColumn = 0, gistFormat = True)
        # results[comp].append(mc.cv(kdata))

    # -- featsel: RFE and filter-based feature selection --
    comp = "featsel"
    if component == "all" or component == comp:
        results[comp] = []
        s = svm.SVM()
        d = DataSet(yeastdatafile, labelsColumn=0)
        d2 = labels.oneAgainstRest(d, "2")
        results[comp].append(s.stratifiedCV(d2))
        # feature selection using RFE
        m = composite.FeatureSelect(s, featsel.RFE())
        results[comp].append(m.stratifiedCV(d2, 3))
        fs = featsel.FeatureScore("golub")
        f = featsel.Filter(fs, sigma=2)
        f = featsel.Filter(fs, numFeatures=20)
        m = composite.FeatureSelect(s, f)
        results[comp].append(m.stratifiedCV(d2, 3))
        # same thing but with a Chain:
        c = composite.Chain([f, s])
        # r = c.stratifiedCV (d2)

    # -- modelSelection: grid search over C and kernel parameters --
    comp = "modelSelection"
    if component == "all" or component == comp:
        results[comp] = []
        s = svm.SVM()
        d = DataSet(heartdatafile, labelsColumn=0)
        p = modelSelection.ParamGrid(svm.SVM(ker.Polynomial()), "C",
                                     [0.1, 1, 10, 100], "kernel.degree", [2, 3, 4])
        p = modelSelection.ParamGrid(svm.SVM(ker.Gaussian()), "C",
                                     [0.1, 1, 10, 100], "kernel.gamma", [0.01, 0.1, 1])
        # p = modelSelection.Param(svm.SVM(), 'C', [0.1, 1, 10, 100])
        m = modelSelection.ModelSelector(p, measure="roc", foldsToPerform=2)
        m = modelSelection.ModelSelector(p)
        # m = modelSelection.SVMselect()
        results[comp].append(m.cv(d))
    return results
def test(component='svm', **args): container = 'SparseDataSet' if 'container' in args: container = args['container'] try: DataSet = getattr(vectorDatasets, container) except: raise ValueError, 'wrong container ' + container results = {} comp = 'general' if component == 'all' or component == comp: s = svm.SVM() results = {} d = DataSet(heartdatafile, labelsColumn=0) s.train(d) s.test(d) s = svm.SVM() s.stratifiedCV(d) print 'starting aggregate****************' d2 = Aggregate([d, d]) print 'end aggregate' r = s.stratifiedCV(d2) d.attachKernel('polynomial') s.cv(d) d.attachKernel('linear') s = svm.SVM() s.train(d) s.train(d, saveSpace=False) s.save("tmp") loaded = svm.loadSVM("tmp", datasetClass=DataSet) r = loaded.test(d) d.attachKernel('gaussian', gamma=0.01) s.train(d, saveSpace=False) s.save("tmp") loaded = svm.loadSVM("tmp", datasetClass=DataSet, labelsColumn=1) r = loaded.test(d) os.remove('tmp') d = DataSet(numpy.random.randn(100, 10)) d = DataSet([[1, 2], [2, 3]]) d = SequenceData(['asa', 'ben', 'hur']) comp = 'svm' if component == 'all' or component == comp: d = DataSet(heartdatafile, labelsColumn=0) results[comp] = [] d.attachKernel('polynomial') s = svm.SVM() results[comp].append(s.cv(d, saveSpace=True)) d.attachKernel('linear') results[comp].append(s.cv(d)) comp = 'kernelData' if component == 'all' or component == comp: d = DataSet(heartdatafile, labelsColumn=0) results[comp] = [] kdata = KernelData('heart.kernel', gistFormat=True) kdata.attachLabels(d.labels) s = svm.SVM() results[comp].append(s.cv(kdata)) kdata.attachKernel('gaussian', gamma=0.1) results[comp].append(s.cv(kdata)) comp = 'normalization' if component == 'all' or component == comp: results[comp] = [] data = DataSet(heartdatafile, labelsColumn=0) data.attachKernel('polynomial', degree=4, normalization='dices') s = svm.SVM() results[comp].append(s.cv(data)) comp = 'svr' if component == 'all' or component == comp: d = DataSet(heartdatafile, labelsColumn=0, numericLabels=True) results[comp] = 
[] s = svm.SVR() #results[comp].append( # s.cv(d, saveSpace = True)) #results[comp].append( # s.trainTest(d, range(150), range(151, 250))) results[comp].append(s.cv(d)) comp = 'save' if component == 'all' or component == comp: results[comp] = [] s = svm.SVM() data = DataSet(heartdatafile, labelsColumn=0) import tempfile tmpfile = tempfile.mktemp() r = s.cv(data) r.save(tmpfile) r = assess.loadResults(tmpfile) results['save'].append(r) r = s.nCV(data) r.save(tmpfile) results['save'].append(assess.loadResults(tmpfile)) r = {} for i in range(10): r[i] = s.cv(data) assess.saveResultObjects(r, tmpfile) r = assess.loadResults(tmpfile) comp = 'classifiers' if component == 'all' or component == comp: d = DataSet(heartdatafile, labelsColumn=0) results[comp] = [] cl = knn.KNN() results[comp].append(cl.stratifiedCV(d)) print 'testing ridge regression' ridge = ridgeRegression.RidgeRegression() results[comp].append(ridge.cv(d)) comp = 'platt' if component == 'all' or component == 'platt': results[comp] = [] d = DataSet(heartdatafile, labelsColumn=0) p = platt.Platt2(svm.SVM()) results[comp].append(p.stratifiedCV(d)) comp = 'multi' if component == 'all' or component == comp: results[comp] = [] d = DataSet(irisdatafile, labelsColumn=-1) mc = multi.OneAgainstOne(svm.SVM()) results[comp].append(mc.cv(d)) d = DataSet(irisdatafile, labelsColumn=-1) mc = multi.OneAgainstRest(svm.SVM()) results[comp].append(mc.cv(d)) mc = multi.OneAgainstRest(svm.SVM()) d.attachKernel('poly') results[comp].append(mc.cv(d)) d.attachKernel('linear') mc = multi.OneAgainstRest(svm.SVM()) #kdata = datafunc.KernelData('iris.linear.kernel', # labelsFile = 'irisY.csv', labelsColumn = 0, gistFormat = True) #results[comp].append(mc.cv(kdata)) comp = 'featsel' if component == 'all' or component == comp: results[comp] = [] s = svm.SVM() d = DataSet(yeastdatafile, labelsColumn=0) d2 = labels.oneAgainstRest(d, '2') results[comp].append(s.stratifiedCV(d2)) # feature selection using RFE m = composite.FeatureSelect(s, 
featsel.RFE()) results[comp].append(m.stratifiedCV(d2, 3)) fs = featsel.FeatureScore('golub') f = featsel.Filter(fs, sigma=2) f = featsel.Filter(fs, numFeatures=20) m = composite.FeatureSelect(s, f) results[comp].append(m.stratifiedCV(d2, 3)) # same thing but with a Chain: c = composite.Chain([f, s]) #r = c.stratifiedCV (d2) comp = 'modelSelection' if component == 'all' or component == comp: results[comp] = [] s = svm.SVM() d = DataSet(heartdatafile, labelsColumn=0) p = modelSelection.ParamGrid(svm.SVM(ker.Polynomial()), 'C', [0.1, 1, 10, 100], 'kernel.degree', [2, 3, 4]) p = modelSelection.ParamGrid(svm.SVM(ker.Gaussian()), 'C', [0.1, 1, 10, 100], 'kernel.gamma', [0.01, 0.1, 1]) #p = modelSelection.Param(svm.SVM(), 'C', [0.1, 1, 10, 100]) m = modelSelection.ModelSelector(p, measure='roc', foldsToPerform=2) m = modelSelection.ModelSelector(p) #m = modelSelection.SVMselect() results[comp].append(m.cv(d)) return results
if line.rstrip() == 'True': temp = 'True' else: temp = 'False' y.append(temp) ## Now it's time to load our data into PyML's vector objects data = VectorDataSet(X2,L=y2) #Create SVM object, then train our set s = SVM() s.train(data) s.save("freePizza") ## Yay! # Now to cross-validate the data; we first take the other set X3 = X[fifth+1:-1]; y3 = y[fifth+1:-1]; print y3 #Load our training data from PyML.classifiers.svm import loadSVM loadedSVM = loadSVM("freePizza",data) testData = VectorDataSet(X3,L=y3) r = loadedSVM.test(testData) print r
def svm_prediction(peptides, job_id, input_train="SVM_POS_NEG.fasta"):
    """
    Makes a final prediction based on SVM training files.

    Used for prediction of blind datasets, based on the training datasets
    of positives and negatives.

    :param peptides: input peptides
    :param job_id: random job id assigned prior to start predicting
    :param input_train: input positive and negative examples used in training
    :return: list of SVM scores, one per input peptide
    """
    print("Begin SVM")
    global PATH
    global TMP_PATH

    # suppress PyML's verbose output while training/testing
    devnull = open(os.devnull, 'w')
    sys.stdout, sys.stderr = devnull, devnull
    svm_scores = []
    try:
        # the (commented-out) database lookup used to pre-fill svm_scores;
        # without it this branch runs unless peptides is empty
        if len(peptides) != len(svm_scores):
            # generate a svm input from the peptides
            rand = job_id
            input_svm = "%s_svm.fasta" % rand
            # PyML needs both classes in a test file, so each peptide is
            # written twice: once labelled 1 and once labelled -1
            output_tmp = open(os.path.join(TMP_PATH, input_svm), "w")
            try:
                count = 0
                for peptide in peptides:
                    count += 1
                    output_tmp.write("> %i label=%s\n%s\n" % (count, 1, peptide))
                for peptide in peptides:
                    count += 1
                    output_tmp.write("> %i label=%s\n%s\n" % (count, -1, peptide))
            finally:
                output_tmp.close()

            # outputs
            model_svm = "%s_svm_model.txt" % rand

            # train on the stored positive/negative examples
            train_data = SequenceData(os.path.join(PATH, input_train),
                                      mink=1, maxk=1, maxShift=0,
                                      headerHandler=svm_process_header)
            train_data.attachKernel('cosine')
            cval = 1
            s = SVM(C=cval)
            s.train(train_data)
            s.save(os.path.join(TMP_PATH, model_svm))

            # load trained SVM (loadSVM needs the training data too)
            loaded_svm = loadSVM(os.path.join(TMP_PATH, model_svm), train_data)

            # test data
            test_data = SequenceData(os.path.join(TMP_PATH, input_svm),
                                     mink=1, maxk=1, maxShift=0,
                                     headerHandler=svm_process_header)
            test_data.attachKernel('cosine')
            results = loaded_svm.test(test_data)

            # print results out
            output_svm = "%s_svm.txt" % rand
            results.toFile(os.path.join(TMP_PATH, output_svm))

            # load results, keeping only the positively-labelled copies
            infile = open(os.path.join(TMP_PATH, output_svm), "r")
            try:
                inlines = infile.readlines()
            finally:
                infile.close()
            scores = list()
            for line in inlines:
                line = line.rstrip("\r\n")
                try:
                    fields = line.split("\t")
                    entry = int(fields[0])
                    score = float(fields[1])
                    label = int(fields[3])
                    # BUGFIX: label is an int, so the old `label != "-1"`
                    # comparison was always true and kept every peptide's
                    # negative duplicate as well, doubling the output.
                    if label != -1:
                        scores.append([entry, score])
                except (ValueError, IndexError):
                    # header / malformed lines are skipped
                    pass

            # restore the original input order of the peptides
            sorted_scores = sorted(scores, key=lambda item: item[0])
            svm_scores = [item[1] for item in sorted_scores]

            # remove the temporary model files and results; each removal is
            # independent so one failure doesn't leave the others behind
            for tmp_name in (input_svm, model_svm, output_svm):
                try:
                    os.remove(os.path.join(TMP_PATH, tmp_name))
                except OSError:
                    pass
    finally:
        # restore normal output even if PyML raised, and close devnull
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
        devnull.close()
    print("End SVM")
    return svm_scores