def testBroad():
    """Print a quick sanity dump of the data set returned by ``Data``.

    Output, in order: the labels, expression rows 0, 5 and 17, the number
    of rows in the expression matrix, and the length of its first row.
    """
    dataset = Data()
    expression = dataset.get_gene_exp_matrix()
    class_labels = dataset.get_labels()
    # Fetched but never printed; the call is kept so the accessor is still
    # exercised exactly as before.
    gene_names = dataset.get_gene_names()

    print(class_labels)
    # Spot-check a few fixed rows of the matrix.
    for row in (0, 5, 17):
        print(expression[row])
    print(len(expression))
    print(len(expression[0]))
def ecocfn(code_size=52./38, C=50, linSVC_L1=False, kernel='linear', selection='chi2', numFeatures=330):
    """Run leave-one-out cross-validation of an output-code classifier and print the result.

    Args:
        code_size: Passed to ``OutputCodeClassifier`` as ``code_size``
            (the original author's note: try different code sizes).
        C: Regularisation parameter given to the base SVM in both branches.
        linSVC_L1: If true, the base estimator is
            ``svm.LinearSVC(penalty='l1', dual=False)``; otherwise it is
            ``svm.SVC`` with the given ``kernel``.
        kernel: Kernel for ``svm.SVC``; ignored when ``linSVC_L1`` is true.
        selection: Forwarded to ``leaveOneOutCrossValid`` as ``selection``.
        numFeatures: Forwarded to ``leaveOneOutCrossValid`` as ``numFeatures``.
    """
    dataset = Data()
    expression = dataset.get_gene_exp_matrix()
    class_labels = dataset.get_labels()

    base_clf = (
        svm.LinearSVC(C=C, penalty='l1', dual=False)
        if linSVC_L1
        else svm.SVC(C=C, kernel=kernel)
    )
    # random_state is pinned so the generated code book is reproducible.
    ecoc = OutputCodeClassifier(base_clf, code_size=code_size, random_state=0)

    result = leaveOneOutCrossValid(
        expression,
        class_labels,
        ecoc,
        selection=selection,
        numFeatures=numFeatures,
    )
    print(result)
def svmfn(featureSelectionMethod='none', numFeaturesA='330'):
    """Fit a one-vs-rest linear SVM and print its training-set performance.

    NOTE: a second function named ``svmfn`` is defined later in this file;
    at import time that later definition replaces this one.

    Args:
        featureSelectionMethod: Forwarded to ``trainingSetPerformance`` as
            ``selection``.
        numFeaturesA: Forwarded to ``trainingSetPerformance`` as
            ``numFeatures``.
            NOTE(review): the default is the *string* '330' whereas
            ``ecocfn`` uses the integer 330 -- confirm the helper accepts a
            string before relying on the default with feature selection on.
    """
    data = Data()
    gene_exp = data.get_gene_exp_matrix()
    labels = data.get_labels()
    names = data.get_gene_names()

    # svm.SVC on its own resolves multiclass problems one-vs-one; wrapping it
    # in OneVsRestClassifier trains one binary SVC per class instead.
    # kernel can be poly, rbf, linear, sigmoid
    clf = svm.SVC(C=125., kernel='linear')
    clfwrapper = OneVsRestClassifier(clf)

    # Leave-one-out evaluation (leaveOneOutCrossValid) is currently disabled;
    # only the training-set figure is reported.
    trainingError = trainingSetPerformance(
        gene_exp,
        labels,
        clfwrapper,
        names=names,
        selection=featureSelectionMethod,
        numFeatures=numFeaturesA,
    )

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('trainingError is')
    print(trainingError)
def svmfn(featureSelectionMethod='none', numFeatures='330', maxGenes=80):
    """Fit a one-vs-rest L1 linear SVM and print a gene signature per class.

    After printing the value returned by ``trainingSetPerformance``, the
    function walks the fitted per-class estimators and, for each one, prints
    the class name, the list of genes with the largest positive coefficients
    (at most ``maxGenes``), and the list length.  Each entry is formatted as
    ``"<coef>,<gene name>:<mean expression of that gene in the class>"``.
    Finally the mean signature length over all estimators is printed.

    Args:
        featureSelectionMethod: Forwarded to ``trainingSetPerformance`` as
            ``selection``.
        numFeatures: Forwarded to ``trainingSetPerformance`` as
            ``numFeatures``.
            NOTE(review): the default is the string '330' -- confirm the
            helper accepts a string.
        maxGenes: Upper bound on the number of genes listed per class
            (previously hard-coded to 80).

    NOTE(review): coefficient positions are used directly as indices into
    ``names`` and the expression columns.  That presumably only holds when no
    feature selection re-orders or drops columns -- confirm against
    ``trainingSetPerformance``.
    """
    data = Data()
    gene_exp = data.get_gene_exp_matrix()
    labels = data.get_labels()
    names = data.get_gene_names()

    clf = svm.LinearSVC(C=125, penalty="l1", dual=False)
    clfwrapper = OneVsRestClassifier(clf)

    # Leave-one-out evaluation is disabled; only training-set performance is
    # computed.  clfwrapper is assumed to be fitted in place by this call,
    # since estimators_ is read from it below -- TODO confirm.
    trainingError = trainingSetPerformance(
        gene_exp,
        labels,
        clfwrapper,
        names=names,
        selection=featureSelectionMethod,
        numFeatures=numFeatures,
    )
    # Label kept from the original output even though no accuracy follows it.
    print('accuracy is')
    print('trainingError is')
    print(trainingError)

    estimators = clfwrapper.estimators_
    # Derive the gene count from the data instead of a hard-coded 11927.
    n_genes = len(gene_exp[0])
    totalGeneListLength = 0

    for class_idx, estimator in enumerate(estimators):
        print('estimator for class')
        print(data.getCellName(class_idx))

        # Rank (coefficient, gene index) pairs from largest to smallest.
        coefs = estimator.coef_[0]
        ranked = sorted(zip(coefs, range(len(coefs))), reverse=True)[:maxGenes]

        # Mean expression of every gene over the samples of this class.
        indices = data.indices_of_celltype(class_idx)
        expr_sum = numpy.zeros(n_genes)
        for sample_idx in indices:
            expr_sum = numpy.add(expr_sum, gene_exp[sample_idx])
        mean_expr = numpy.divide(expr_sum, len(indices))

        # Keep only the leading run of strictly positive coefficients.
        # Iterating over `ranked` (rather than indexing up to a fixed 80)
        # avoids an IndexError when fewer coefficients are available.
        geneList = []
        for coef, gene_idx in ranked:
            if not coef > 0:
                break
            geneList.append(
                str(coef) + ',' + names[gene_idx] + ':' + str(mean_expr[gene_idx])
            )

        print(geneList)
        print(len(geneList))
        totalGeneListLength += len(geneList)

    print('avg gene signature size:')
    # Average over the actual number of classes (was a hard-coded 35) using
    # float division so the result is not truncated under Python 2.
    n_classes = len(estimators)
    print(float(totalGeneListLength) / n_classes if n_classes else 0.0)