def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or None if the denominator is 0."""
    if denominator == 0:
        return None
    # float() forces true division even where "/" on two ints would truncate
    # (Python 2 without ``from __future__ import division``).
    return float(numerator) / denominator


def evaluate(fmat, labels, targets, method, k, c_param, nu_param, learn_rate, n_estimators):
    """Cross-validate a classifier and score every fold.

    Folds come from ``mlpy.cv_kfold`` stratified on ``labels``.  For each fold
    a classifier is built by ``train_classifier`` on the training rows and
    asked to predict the test rows.

    Parameters:
        fmat: feature matrix; must support indexing with the fold index
            arrays (e.g. a NumPy array).
        labels: true class labels, indexed the same way.  Values are used as
            list indices into ``detailed`` and only 0/1 are counted in the
            metrics.
        targets: one entry per sample; converted to a NumPy array and sorted
            into ``detailed`` by (true label, predicted label).
        method, c_param, nu_param, learn_rate, n_estimators: passed through
            unchanged to ``train_classifier``.
        k: number of folds.

    Returns:
        A tuple ``(out, detailed)``:
        ``out`` is a list with one ``(precision, recall, accuracy)`` tuple per
        fold; a metric is None when its denominator is zero.
        ``detailed`` is a 2x2 nested list where ``detailed[actual][predicted]``
        holds the targets of the test samples with that outcome, accumulated
        over all folds.
    """
    nptargets = np.array(targets)
    out = []
    detailed = [[[], []], [[], []]]

    for tr, ts in mlpy.cv_kfold(len(labels), k, strat=labels):
        clas = train_classifier(fmat[tr], labels[tr], method, c_param,
                                nu_param, learn_rate, n_estimators)

        # Some classifiers expose ``pred`` and others ``predict``.  Resolve the
        # method first so that an AttributeError raised *inside* a prediction
        # call is not mistaken for a missing method.
        predict = getattr(clas, 'pred', None)
        if predict is None:
            predict = clas.predict
        pred = predict(fmat[ts])

        # Single pass: file each target under its outcome and tally the
        # confusion counts at the same time.
        tp = tn = fp = fn = 0
        for target, actual, raw_predicted in zip(nptargets[ts], labels[ts], pred):
            predicted = int(raw_predicted)
            detailed[actual][predicted].append(target)
            if actual == 1 and predicted == 1:
                tp += 1
            elif actual == 0 and predicted == 0:
                tn += 1
            elif actual == 0 and predicted == 1:
                fp += 1
            elif actual == 1 and predicted == 0:
                fn += 1

        precision = _safe_ratio(tp, tp + fp)
        recall = _safe_ratio(tp, tp + fn)
        accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
        out.append((precision, recall, accuracy))

    return out, detailed
def cross_validation(data, y):
    """Cross-validate ``train`` on ``data``/``y`` with one fold per row.

    ``mlpy.cv_kfold`` is asked for as many folds as ``data`` has rows, using a
    fixed seed (4).  For every fold a model is trained on the training rows,
    then scored with ``evaluate`` on both the training rows and the held-out
    rows.

    Parameters:
        data: 2-D array of samples (rows) by features; indexed as
            ``data[indices, :]``.
        y: array of labels, indexed with the same fold index arrays.

    Returns:
        A tuple ``(confusion, train_accuracy, test_accuracy)``:
        ``confusion`` is whatever ``evaluate`` returned for the last fold when
        given the running matrix (initially 3x3 zeros) -- presumably the
        accumulated confusion matrix; confirm against ``evaluate``.
        ``train_accuracy`` and ``test_accuracy`` are the per-fold accuracies
        averaged over all folds.
    """
    n_samples = numpy.size(data, 0)
    folds = mlpy.cv_kfold(n_samples, n_samples, strat=None, seed=4)

    confusion = numpy.zeros((3, 3))
    train_acc_total = 0
    test_acc_total = 0
    n_folds = 0

    for tr, ts in folds:
        X_tr, Y_tr = data[tr, :], y[tr]
        X_ts, Y_ts = data[ts, :], y[ts]

        model = train(X_tr, Y_tr)

        # evaluate() gets a copy of the training predictions, as before, so
        # the array returned by the model is never handed over directly.
        train_pred = model.pred(X_tr).copy()
        test_pred = model.pred(X_ts)

        # Training-set score: only the accuracy is needed here.
        train_acc, _ = evaluate(train_pred, Y_tr)
        train_acc_total = train_acc_total + train_acc

        # Held-out score: the running confusion matrix is threaded through.
        test_acc, confusion = evaluate(test_pred, Y_ts, confusion)
        test_acc_total = test_acc_total + test_acc

        n_folds = n_folds + 1

    return (confusion,
            train_acc_total / float(n_folds),
            test_acc_total / float(n_folds))
def kfold(vectors, labels, count):
    """
    Split ``vectors`` and ``labels`` into ``count`` cross-validation folds.

    The fold indices come from ``mlpy.cv_kfold(n=len(vectors), k=count)``.

    Returns a tuple of four lists
    (training_x, training_y, testing_x, testing_y), each holding one
    entry per fold; every entry is itself a list of the selected items.
    """
    def gather(sequence, indices):
        # Pick the items of ``sequence`` at the given positions, in order.
        return [sequence[position] for position in indices]

    train_x, train_y = [], []
    test_x, test_y = [], []

    folds = mlpy.cv_kfold(n=len(vectors), k=count)
    for train_idx, test_idx in folds:
        train_x.append(gather(vectors, train_idx))
        train_y.append(gather(labels, train_idx))
        test_x.append(gather(vectors, test_idx))
        test_y.append(gather(labels, test_idx))

    return (train_x, train_y, test_x, test_y)
ys=[] if random_labels: np.random.seed(0) tmp = y.copy() np.random.shuffle(tmp) for i in range(CV_N): ys.append(tmp) else: for i in range(CV_N): ys.append(y) for n in range(CV_N): seed = n while True: idx = mlpy.cv_kfold(n=x.shape[0], k=CV_K, strat=ys[n], seed=seed) for i, (idx_tr, idx_ts) in enumerate(idx): x_tr, x_ts = x[idx_tr], x[idx_ts] if any(np.var(x_tr, axis=0) == 0): seed += CV_N break else: break if seed > n + CV_N * KFOLD_TRY: raise IOError, 'filter threshold should be more higher' print "%d over %d experiments" % (n+1, CV_N) for i, (idx_tr, idx_ts) in enumerate(idx):
## run CV for transitions ## IN CONTROL PATIENTS from operator import itemgetter from sklearn import metrics import mlpy x = feature_matrix_med_counts_IN_CONTROL_FROM_ALL #set x (feature matrix) y = np.array(y_IN_CONTROL_TRANSITION_FROM_ALL) #vector with classes (HTN control status); cast it as numpy array #cv PARAMETERS numsamples = len(y) numfolds = 10 idx = mlpy.cv_kfold(n=numsamples, k=10) #for tr, ts in idx: print(tr, ts) #print out the indexes for CV #do a k-fold CV d_results = {'ACCURACY': None, 'AUC': None, 'SENSITIVITY': None, 'SPECIFICITY' :None, 'CORRECT_PER_FOLD' : [], 'NUM_FOLD': numfolds,'AUC_PER_FOLD': [], 'FPR_PER_FOLD':[], 'TPR_PER_FOLD':[], 'ACC_PER_FOLD':[], 'SENS_PER_FOLD': [] ,'SPEC_PER_FOLD':[], 'REPORT_PER_FOLD': [], 'PPV_PER_FOLD':[], 'NPV_PER_FOLD':[], 'PPV': None, 'NPV': None} for tr, ts in idx: trainset_samples = itemgetter(*tr)(y) testset_samples = itemgetter(*ts)(y) trainset_features = itemgetter(*tr)(x) testset_features = itemgetter(*ts)(x) #build the regression model ################################### model_logistic = mlpy.LibLinear(solver_type='l2r_lr') #default: mlpy.LibLinear(solver_type='l2r_lr', C=1, eps=0.01, weight={})
# NOTE(review): sklearn.cross_validation and sklearn.grid_search only exist in
# old scikit-learn releases; newer releases provide the same names from
# sklearn.model_selection. Left unchanged here -- confirm the target version.
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn import neighbors, datasets
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Load the comma-separated dataset.
datos = np.loadtxt('datasetnew.txt', delimiter=',')

# Columns 0-9 are the features; column 10 is the class label.
# The builtin int replaces the deprecated alias np.int (same dtype).
x, y = datos[:, :10], datos[:, 10].astype(int)

# x = x - np.mean(x, axis=0)
# Scale every feature column to unit Euclidean norm.
xn = x / np.sqrt(np.sum(x**2, axis=0))

# One fold per sample: k equals the number of rows.
kfold = mlpy.cv_kfold(len(xn), len(xn))

#-----------------------------------------------------------
# percent = []
# # 0.722916666667
# result = []
# classifier = LDA()
# classifier.fit(xn, y)
# for crossval in range(len(kfold)):
#     trainindex = kfold[crossval][0].tolist()
#     evaluateindex = kfold[crossval][1].tolist()
#     trainx = xn[trainindex]
#     trainy = y[trainindex]
#     evaluatex = xn[evaluateindex]
#     evaluatey = y[evaluateindex]
#
v.append(str(m).rpartition(",")[2]) #close the file object fo.close() return r def correctVals(array): c = 0 for d in array: array[c] = abs(int(d)) c += 1 return array data = openFile() x = data[0] y = data[1] i = mlpy.cv_kfold(n=len(x), k=10) m = 0 KNNRESULTS = [] DTRESULTS = [] LRRESULTS = [] while (len(i)>0): m += 1 trainx = [] trainy = [] testx = [] testy = [] indx = i.pop() for c in indx[0]: