def generateLearningCurve(X, y, degree, regLambda):
    """ computing learning curve via leave one out CV """
    n = len(X)

    errorTrains = np.zeros((n, n - 1))
    errorTests = np.zeros((n, n - 1))

    loo = model_selection.LeaveOneOut()
    itrial = 0
    for train_index, test_index in loo.split(X):
        # print("TRAIN indices:", train_index, "TEST indices:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        (errTrain, errTest) = learningCurve(X_train, y_train, X_test, y_test, regLambda, degree)
        errorTrains[itrial, :] = errTrain
        errorTests[itrial, :] = errTest
        itrial = itrial + 1

    errorTrain = errorTrains.mean(axis=0)
    errorTest = errorTests.mean(axis=0)

    plotLearningCurve(errorTrain, errorTest, regLambda, degree)
def leave_out_example():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([0, 0, 2, 2])

    if False:
        lo = model_selection.LeavePOut(p=2)
        print('#splits =', lo.get_n_splits(X))
    elif False:
        # The same group will not appear in two different folds.
        # The number of distinct groups has to be at least equal to the number of folds.
        lo = model_selection.LeaveOneGroupOut()
        #print('#splits =', lo.get_n_splits(X, y, groups))
        print('#splits =', lo.get_n_splits(groups=groups))
    elif False:
        # Every combination of p groups is left out as the test set.
        # LeaveOneGroupOut takes no n_groups argument; LeavePGroupsOut is the splitter that does.
        lo = model_selection.LeavePGroupsOut(n_groups=2)
        #print('#splits =', lo.get_n_splits(X, y, groups))
        print('#splits =', lo.get_n_splits(groups=groups))
    else:
        lo = model_selection.LeaveOneOut()
        print('#splits =', lo.get_n_splits(X))

    print('Leave-out:', lo)
    #for train_indices, test_indices in lo.split(X, y, groups):
    for train_indices, test_indices in lo.split(X):
        #print('TRAIN:', train_indices.shape, 'TEST:', test_indices.shape)
        print('TRAIN:', train_indices, 'TEST:', test_indices)
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
def test_split(self):
    X = np.array([1, 2, 3, 4])
    fold1 = model_selection.LeaveOneOut().split(X)
    fold2 = sklearn_model_selection.LeaveOneOut().split(X)
    self.assertFoldEqual(fold1, fold2)
def caseLOO(X, Y, para):
    para = 0
    loo = skmdls.LeaveOneOut()
    N = label_all.shape[1]
    mdl = loo
    X_train, X_test, y_train, y_test = train_test_constructor(N, mdl, X, Y)
    return X_train, X_test, y_train, y_test, N
def cv_LinearRegression_Bias(xM, yV):
    """
    N_it iterations are performed for cross-validation in order to average the results further.
    The 'disp' flag is turned off so each iteration is not shown.
    """
    #print("cv_LinearRegression_None", xM.shape, yV.shape)
    X, y = np.array(xM)[:, 0], np.array(yV)[:, 0]

    # only 1-dim is allowed for both X and y
    assert (X.ndim == 1) or (X.shape[1] == 1) and (yV.ndim == 1) or (yV.shape[1] == 1)

    loo_c = model_selection.LeaveOneOut()
    loo = loo_c.split(X)

    yP = y.copy()
    for train, test in loo:
        bias = np.mean(y[train] - X[train])
        yP[test] = X[test] + bias

    cv_score_le = np.abs(np.array(y - yP)).tolist()

    o_d = {'median_abs_err': np.median(cv_score_le),
           'mean_abs_err': np.mean(cv_score_le),
           'std_abs_err': np.std(cv_score_le),  # this can be std(err)
           'list': cv_score_le,
           'ci': "t.b.d",
           'yVp': X.tolist()}

    return o_d
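# Usage sketch (my assumption, not part of the original source): cv_LinearRegression_Bias
# expects xM and yV as single-column arrays and reports leave-one-out absolute errors of a
# bias-only correction. The toy values below are made up, and the imports mirror what the
# function itself appears to rely on.
import numpy as np
from sklearn import model_selection

xM_demo = np.array([[1.0], [2.0], [3.0], [4.0]])
yV_demo = np.array([[1.2], [2.1], [3.3], [3.9]])

o_d = cv_LinearRegression_Bias(xM_demo, yV_demo)
print("median |err|:", o_d['median_abs_err'], "mean |err|:", o_d['mean_abs_err'])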
def __init__(self):
    f = open(r'C:\Sagar_Agrawal\Projects\ML\Codes\output\Reports\out.csv', encoding='utf-8')
    # firstrow = f.readline()  # skip the header
    data = np.loadtxt(f, skiprows=1, delimiter=',')

    X = data[:, 1:]  # select columns 1 through end
    y = data[:, 0]   # select column 0, the risk

    imageno = len(y)
    healthy = np.zeros(np.sum(y == 0), int)   # int instead of the removed np.int
    diseased = np.zeros(np.sum(y != 0), int)  # count must match the y[i] != 0 branch below
    j = 0
    k = 0
    for i in range(0, imageno, 1):
        if y[i] == 0:
            healthy[j] = i
            j += 1
        else:
            diseased[k] = i
            k += 1

    loo = model_selection.LeaveOneOut()
    self.classify(X, y, healthy, diseased, loo)
def split(self, X, y=None):
    folds = []
    loo = model_selection.LeaveOneOut()
    for train, test in loo.split(X):
        folds.append((train, test))
    return folds
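# Usage sketch (my assumption, not part of the original snippet): scikit-learn accepts any
# iterable of (train, test) index arrays as `cv`, so a pre-materialized list of LeaveOneOut
# folds like the one returned above can be passed to cross_val_score directly. The data and
# the LogisticRegression estimator here are toy placeholders.
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

X_demo = np.arange(20, dtype=float).reshape(10, 2)
y_demo = np.array([0, 1] * 5)

folds = list(model_selection.LeaveOneOut().split(X_demo))  # same structure as split() above
scores = model_selection.cross_val_score(LogisticRegression(), X_demo, y_demo, cv=folds)
print("LOO accuracy:", scores.mean())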
def cv_model(nsplits):
    if nsplits <= 0:
        cv = None
    elif nsplits == 1:
        cv = model_selection.LeaveOneOut()
    else:
        cv = model_selection.KFold(n_splits=nsplits)
    return cv
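# Usage sketch (assumption, not from the original source): the object returned by cv_model()
# can be handed to cross_val_score. cv=None falls back to scikit-learn's default 5-fold CV,
# nsplits == 1 requests leave-one-out, and larger values give plain KFold. The Ridge
# estimator and the random data are placeholders; MSE scoring is used because R^2 is not
# defined on single-sample test folds.
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X_demo = rng.rand(12, 3)
y_demo = X_demo @ np.array([1.0, 2.0, 3.0]) + 0.1 * rng.randn(12)

scores = model_selection.cross_val_score(Ridge(), X_demo, y_demo, cv=cv_model(1),
                                         scoring='neg_mean_squared_error')
print("LOO folds:", len(scores), "mean squared error:", -scores.mean())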
def do_leave_one_out_cv(model):
    # Leave one out cross validation
    from sklearn import model_selection
    loocv = model_selection.LeaveOneOut()
    results = model_selection.cross_val_score(model, x_train, y_train, cv=loocv)
    for fold in results:
        print("Accuracy: {:.2%}".format(fold))
def train_and_test(mlp, X, y):
    loo = skselection.LeaveOneOut()
    hits = 0
    for train_index, test_index in loo.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        mlp.fit(X_train, y_train)
        if mlp.predict(X_test) == y_test:
            hits += 1
    accuracy = hits / X.shape[0] * 100
    return accuracy
def runOneOut():
    num_folds = 5
    num_instances = len(X)
    num_trees = 50
    loocv = model_selection.LeaveOneOut()
    loocv.get_n_splits(X)
    model = SVC()
    results = model_selection.cross_val_score(model, X, Y.ravel(), cv=loocv, n_jobs=-1)
    #print("LogisticRegression Accuracy: %.3f%% (%.3f%%)") % (results.mean() * 100.0, results.std() * 100.0)
    print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std() * 2))
    print(results.std())
def leaveOneOutCrossValidationEvaluation(self):
    Model.models[:] = []
    __results = []
    __names = []
    __looCrossValidation = model_selection.LeaveOneOut()
    for name, model in self.__models:
        cv_results = model_selection.cross_val_score(
            model, self.x_train, self.y_train, cv=__looCrossValidation)
        __results.append(cv_results)
        __names.append(name)
        Model.models.append(Model(name, cv_results.mean(), cv_results.std()))
    return Model.getHighestScore()[0]
def tune_PLSR(x, y):
    """ Parameter tuning of PLS regression """
    n_comp_range = range(1, int(maxComp))
    param_grid = dict(n_components=n_comp_range)
    scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # Leave-one-out cross validation
    cv = model_selection.LeaveOneOut()
    cv.get_n_splits(x)

    # grid search
    grid = model_selection.GridSearchCV(PLSRegression(), param_grid=param_grid,
                                        scoring=scorer, cv=cv)
    grid.fit(x, y)
    scores = grid.cv_results_  # grid_scores_ was removed from GridSearchCV; cv_results_ is its replacement
    return grid, scores
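# Usage sketch (my assumption, not part of the original source): tune_PLSR relies on
# module-level imports and a global maxComp; the ones below mirror what it appears to expect,
# and the data is a random placeholder. With cv_results_, the chosen component count comes
# from best_params_ and the per-candidate scores from "mean_test_score".
import numpy as np
from sklearn import model_selection
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import make_scorer, mean_squared_error

maxComp = 5  # assumed global upper bound on n_components

rng = np.random.RandomState(0)
x_demo = rng.rand(15, 8)
y_demo = x_demo[:, 0] + 0.05 * rng.randn(15)

grid, scores = tune_PLSR(x_demo, y_demo)
print("best n_components:", grid.best_params_["n_components"])
print("mean CV score per candidate:", scores["mean_test_score"])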
def cv_detail():
    iris = datasets.load_iris()
    lr = linear_model.LogisticRegression()

    print(model_selection.cross_val_score(lr, iris.data, iris.target,
                                          cv=model_selection.KFold()))
    # [ 0.  0.  0.]
    print(model_selection.cross_val_score(lr, iris.data, iris.target,
                                          cv=model_selection.KFold(n_splits=5)))
    # [ 1.  0.93333333  0.43333333  0.96666667  0.43333333]
    print(model_selection.cross_val_score(lr, iris.data, iris.target,
                                          cv=model_selection.KFold(shuffle=True, random_state=0)))
    # [ 0.9  0.96  0.96]
    print(model_selection.cross_val_score(lr, iris.data, iris.target,
                                          cv=model_selection.KFold(shuffle=True, random_state=0, n_splits=5)))
    # [ 0.96666667  0.9  0.96666667  0.96666667  0.93333333]

    # LeaveOneOut() splits into as many folds as there are data points;
    # it is the method to use when the dataset is small.
    loocv = model_selection.cross_val_score(lr, iris.data, iris.target,
                                            cv=model_selection.LeaveOneOut())
    print(loocv)
    print(len(loocv))

    loocv = model_selection.cross_val_score(lr, iris.data, iris.target,
                                            cv=model_selection.KFold(n_splits=150))
    print(loocv.mean())
def evaluateModel(self, model, features, classes, train_size=0.7):
    XT, XF, YT, YF = model_selection.train_test_split(features, classes, train_size=train_size)
    kf2 = model_selection.KFold(n_splits=5, shuffle=True, random_state=12345)
    # https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
    # https://chrisalbon.com/machine_learning/model_evaluation/cross_validation_parameter_tuning_grid_search/

    # Splits so that each element lands in the test set exactly once, in order
    kf1 = model_selection.KFold(n_splits=5, shuffle=False)
    # Splits so that each element lands in the test set exactly once, in random order
    kf2 = model_selection.KFold(n_splits=5, shuffle=True, random_state=12345)
    # Splits so that every test set contains roughly the same number of elements of each class
    kf3 = model_selection.StratifiedKFold(n_splits=5, shuffle=False)
    # Splits in random order; elements may repeat across test sets
    kf4 = model_selection.ShuffleSplit(n_splits=10, random_state=12345)
    # Splits in random order; elements may repeat, and test sets contain roughly the same number of elements of each class
    kf5 = model_selection.StratifiedShuffleSplit(n_splits=10, random_state=12345)
    # Creates N test sets, each containing one element in turn
    kf6 = model_selection.LeaveOneOut()

    self.trainModel(model, XT, YT)
    YP = self.predictModel(model, XF)
    acc = metrics.accuracy_score(YF, YP)
    prec = metrics.precision_score(YF, YP)
    rec = metrics.recall_score(YF, YP)
    f1 = metrics.f1_score(YF, YP)
    return f1, prec, rec, acc
def MODEL_CV(cv_type="KFold", n_splits=N_SPLITS, random_state=RANDOM_STATE,
             test_size=TEST_SIZE, scoring=SCORING, shuffle=SHUFFLE):
    if cv_type == "KFold":
        return model_selection.KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    elif cv_type == "LeaveOneOut":
        return model_selection.LeaveOneOut()
    elif cv_type == "ShuffleSplit":
        return model_selection.ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
    else:
        raise Exception()
def resultValidation(model_classifier, featureValues, type_label, validator=1):
    """
    Estimates the model's prediction accuracy by calling a selected cross validation method;
    uses the K-Fold cross validator by default.
    :param model_classifier: input classifier
    :param featureValues: list of feature values
    :param type_label: list of label results
    :return: a string of the model's prediction accuracy
    """
    if not isinstance(validator, int):
        raise Exception("Non Integer Value entered into result validation function")
    elif type(featureValues) != list or type(type_label) != list:
        raise Exception("Non list values added in feature value/type label parameter")
    elif validator < 0:
        raise Exception("Invalid validator selected")
    elif len(featureValues) < 10 or len(type_label) < 10:
        raise Exception("Number of samples cannot be less than 10")
    else:
        # K-FOLD CROSS VALIDATION
        if (validator == 1):
            kfold = KFold(n_splits=10)  # <- Change split number here
            model_Kfold = model_classifier
            results_Kfold = model_selection.cross_val_score(model_Kfold, featureValues, type_label, cv=kfold)
            return '{0:.2f}'.format(results_Kfold.mean() * 100.0)
        # STRATIFIED K-FOLD CROSS VALIDATION
        elif (validator == 2):
            skfold = StratifiedKFold(n_splits=10)  # <- Change split number here
            model_SKfold = model_classifier
            results_SKfold = model_selection.cross_val_score(model_SKfold, featureValues, type_label, cv=skfold)
            return '{0:.2f}'.format(results_SKfold.mean() * 100.0)
        # LEAVE ONE OUT CROSS VALIDATION (LOOCV)
        elif (validator == 3):
            loocv = model_selection.LeaveOneOut()
            model_loocv = model_classifier
            results_loocv = model_selection.cross_val_score(model_loocv, featureValues, type_label, cv=loocv)
            return '{0:.2f}'.format(results_loocv.mean() * 100.0)
        # REPEATED RANDOM TEST-TRAIN SPLITS
        else:
            rrtt = model_selection.ShuffleSplit(n_splits=10, test_size=0.30, random_state=100)  # <- Change split number and test_size here
            model_shufflecv = model_classifier
            results_4 = model_selection.cross_val_score(model_shufflecv, featureValues, type_label, cv=rrtt)
            return '{0:.2f}'.format(results_4.mean() * 100.0)
def _PARA_GRIDDING(Model, X, y, param_grid,
                   _Scale=False, _CVType="KFold",
                   n_splits=N_SPLITS, random_state=RANDOM_STATE,
                   scoring=SCORING, test_size=TEST_SIZE):  # Fine!
    if _Scale:
        _Scaler = preprocessing.StandardScaler().fit(X=X)
        X = _Scaler.transform(X=X)

    if _CVType == "KFold":
        _Cross_Val = model_selection.KFold(n_splits=n_splits, random_state=random_state)
    elif _CVType == "LeaveOneOut":
        _Cross_Val = model_selection.LeaveOneOut()
    elif _CVType == "ShuffleSplit":
        _Cross_Val = model_selection.ShuffleSplit(n_splits=n_splits, test_size=test_size,
                                                  random_state=random_state)
    else:
        raise Exception()

    _Grid = model_selection.GridSearchCV(estimator=Model, param_grid=param_grid,
                                         cv=_Cross_Val, scoring=scoring)
    _Grid_Result = _Grid.fit(X=X, y=y)
    return _Grid_Result
def leave_one_out_knn_diversity(images_paths, size, k=3):
    """
    Summarize the distance to the k closest images in the set.
    :note: loads all images into memory, so the number of images in the set should be reasonable
    :param images_paths: paths to the imgs
    :param size: size of the imgs
    :returns: a tuple (mean, std, min, max) of the distances
    """
    loo = model_selection.LeaveOneOut()
    images = decode_images(images_paths, size)
    images = np.reshape(images, [len(images), -1])  # Flatten for sklearn [#samples, #features] framework
    dists = []
    for train_idx, test_idx in tqdm(loo.split(images)):
        train = images[train_idx]
        test = images[test_idx]
        d = knn_diversity_stats(train, test, k)
        dists.append(d)
    return np.average(dists), np.std(dists), np.amin(dists), np.amax(dists)
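# knn_diversity_stats is not defined in this snippet; the sketch below is my assumption of
# what it might look like, returning the mean distance from the held-out image to its k
# nearest neighbours in the remaining set, using scikit-learn's NearestNeighbors.
import numpy as np
from sklearn.neighbors import NearestNeighbors


def knn_diversity_stats(train, test, k):
    # Fit on the "train" images (all but the held-out one) and query with the held-out image.
    nn = NearestNeighbors(n_neighbors=min(k, len(train)))
    nn.fit(train)
    distances, _ = nn.kneighbors(test)
    return float(np.mean(distances))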
def UVECV(xTest, yTest, uveLv):
    # kf = model_selection.KFold(n_splits=5, random_state=10)
    loo = model_selection.LeaveOneOut()
    squareArray = np.array([[]])
    coefs = np.array([[]])
    for train, test in loo.split(xTest):
        xTrainTemp = xTest[train, :]
        yTrainTemp = yTest[train]
        xTestTemp = xTest[test, :]
        yTestTemp = yTest[test]
        yPredictTemp, plsModes = PLS(xTestTemp, yTestTemp, xTrainTemp, yTrainTemp, uveLv)
        coefTemp = plsModes.coef_.T
        if coefs.shape[1] == 0:
            coefs = coefTemp
        else:
            coefs = np.append(coefs, coefTemp, axis=0)
        residual = yPredictTemp - yTestTemp
        square = np.dot(residual.T, residual)
        squareArray = np.append(squareArray, square)
        # squareArray.append(square)
    RMSECV = np.sqrt(np.sum(squareArray) / xTest.shape[0])
    return RMSECV, coefs
def __init__(self): f = open("C:\\Users\\Admin\\Desktop\\Final.txt") f.readline() # skip the header data = np.loadtxt(f) X = data[:, 1:] # select columns 1 through end y = data[:, 0] # select column 0, the risk imageno = len(y) healthy = np.zeros(np.sum(y == 0), np.int) diseased = np.zeros(np.sum(y == 1), np.int) j = 0 k = 0 for i in range(0, imageno, 1): if y[i] == 0: healthy[j] = i j += 1 else: diseased[k] = i k += 1 loo = model_selection.LeaveOneOut(imageno) self.classify(X, y, healthy, diseased, loo)
def _ALGO_CMP(Models, X, y,
              _Scale=False, _Plot=False, _CVType="KFold",
              n_splits=N_SPLITS, random_state=RANDOM_STATE,
              scoring=SCORING, test_size=TEST_SIZE):  # Fine!
    _Results = []
    if _Scale:
        _Scaler = preprocessing.StandardScaler().fit(X=X)
        X = _Scaler.transform(X=X)

    if _CVType == "KFold":
        _Cross_Val = model_selection.KFold(n_splits=n_splits, random_state=random_state)
    elif _CVType == "LeaveOneOut":
        _Cross_Val = model_selection.LeaveOneOut()
    elif _CVType == "ShuffleSplit":
        _Cross_Val = model_selection.ShuffleSplit(n_splits=n_splits, test_size=test_size,
                                                  random_state=random_state)
    else:
        raise Exception()

    for _Each in Models:
        _CVResult = model_selection.cross_val_score(estimator=Models[_Each], X=X, y=y,
                                                    scoring=scoring,  # use the parameter rather than the module-level SCORING
                                                    cv=_Cross_Val)
        _Results.append((_Each, _CVResult))

    if _Plot:
        plt.title("Comparison")
        plt.boxplot(x=[_Results[i][1] for i in range(len(_Results))],
                    labels=Models.keys())
        plt.show()

    _Best_Model = _Results[0]
    for i in range(1, len(_Results)):
        if _Results[i][1].mean() > _Best_Model[1].mean():
            _Best_Model = _Results[i]
        elif _Results[i][1].mean() == _Best_Model[1].mean():
            if _Results[i][1].std() > _Best_Model[1].std():
                _Best_Model = _Results[i]
    _Best_Model = Models[_Best_Model[0]]

    return _Results, _Best_Model
def train_svm(train, test, leave_one_out=False, dim_reduc=None, norms=True,
              kernel="LinearSVC", final_pred=False):
    """
    Function to train svm
    :param train: train data (in a pandas dataframe)
    :param test: test data (same)
    :param leave_one_out: whether or not to perform leave-one-out cross validation
    :param dim_reduc: dimensionality reduction of input data. Implemented values are pca and som.
    :param norms: perform normalisations, i.e. z-scores and L2 (default True)
    :param kernel: kernel for SVM
    :param final_pred: do the final predictions?
    :return: returns a pipeline with a fitted svm model, and if possible prints evaluation and writes to disk:
             confusion_matrix.csv, misattributions.csv and (if required) FINAL_PREDICTIONS.csv
    """

    print(".......... Formatting data ........")
    # Save the classes
    classes = list(train.loc[:, 'author'])
    train = train.drop(['author', 'lang'], axis=1)

    if test is not None:
        classes_test = list(test.loc[:, 'author'])
        test = test.drop(['author', 'lang'], axis=1)
        preds_index = list(test.index)

    nfeats = train.columns.__len__()

    # CREATING PIPELINE
    print(".......... Creating pipeline according to user choices ........")
    estimators = []

    if dim_reduc == 'pca':
        print(".......... using PCA ........")
        estimators.append(('dim_reduc', decomp.PCA()))  # chosen with default,
        # which is: n_components = min(n_samples, n_features)

    # if dim_reduc == 'som':
    #     print(".......... using SOM ........")
    #     # TODO: fix SOM
    #     som = minisom.MiniSom(20, 20, nfeats, sigma=0.3, learning_rate=0.5)  # initialization of 50x50 SOM
    #     # TODO: set robust defaults, and calculate number of columns automatically
    #     som.train_random(train.values, 100)
    #     # too long to compute
    #     # som.quantization_error(train)
    #     print(".......... assigning SOM coordinates to texts ........")
    #     train = som.quantization(train.values)
    #     test = som.quantization(test.values)

    if norms:
        # Z-scores
        # TODO: I went to the trouble of implementing something that already
        # exists via sklearn.preprocessing.StandardScaler()
        print(".......... using normalisations ........")
        estimators.append(('scaler', preproc.StandardScaler()))
        # scaler = preproc.StandardScaler().fit(train)
        # train = scaler.transform(train)
        # test = scaler.transform(test)
        # feat_stats = pandas.DataFrame(columns=["mean", "std"])
        # feat_stats.loc[:, "mean"] = list(train.mean(axis=0))
        # feat_stats.loc[:, "std"] = list(train.std(axis=0))
        # feat_stats.to_csv("feat_stats.csv")
        #
        # for col in list(train.columns):
        #     if not train[col].sum() == 0:
        #         train[col] = (train[col] - train[col].mean()) / train[col].std()
        #
        # for index, col in enumerate(test.columns):
        #     if not test.iloc[:, index].sum() == 0:
        #         # keep same as train if possible
        #         if not feat_stats.loc[index, "mean"] == 0 and not feat_stats.loc[index, "std"] == 0:
        #             test.iloc[:, index] = (test.iloc[:, index] - feat_stats.loc[index, "mean"]) / feat_stats.loc[index, "std"]
        #         else:
        #             test.iloc[:, index] = (test.iloc[:, index] - test.iloc[:, index].mean()) / test.iloc[:, index].std()

        # NB: not making the same mistake again; this time using the built-in
        # L2 normalisation
        # cf. https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer
        estimators.append(('normalizer', preproc.Normalizer()))
        # transformer = preproc.Normalizer().fit(train)
        # train = transformer.transform(train)
        # transformer = preproc.Normalizer().fit(test)
        # test = transformer.transform(test)

    print(".......... choosing SVM ........")
    # let's try a standard one: only with PCA, otherwise too hard
    # if withPca:
    #     classif = sk.SVC(kernel='linear')
    # else:
    #     # try a faster one
    #     classif = sk.LinearSVC()

    if kernel == "LinearSVC":
        # try a faster one
        estimators.append(('model', sk.LinearSVC()))
        # classif = sk.LinearSVC()
    else:
        estimators.append(('model', sk.SVC(kernel=kernel)))
        # classif = sk.SVC(kernel=kernel)

    print(".......... Creating pipeline with steps ........")
    print(estimators)
    pipe = skp.Pipeline(estimators)

    # Now, doing leave one out validation or training single SVM with train / test split
    if leave_one_out:
        loo = skmodel.LeaveOneOut()
        print(".......... leave-one-out cross validation will be performed ........")
        print(".......... using " + str(loo.get_n_splits(train)) + " samples ........")

        # Will need to
        # 1. train a model
        # 2. get prediction
        # 3. compute score: precision, recall, F1 for all categories
        skmodel.cross_val_score(pipe, train, classes, cv=loo)

        # Create the preds array
        preds = np.array([], dtype='<U9')
        for train_index, test_index in loo.split(train):
            # print(test_index)
            pipe.fit(train.iloc[train_index, ], [classes[i] for i in list(train_index)])
            preds = np.concatenate((preds, pipe.predict(train.iloc[test_index, ])))

        # and now, leave one out evaluation (very small redundancy here, one line that could be stored elsewhere)
        unique_labels = list(set(classes))
        pandas.DataFrame(metrics.confusion_matrix(classes, preds, labels=unique_labels),
                         index=['true:{:}'.format(x) for x in unique_labels],
                         columns=['pred:{:}'.format(x) for x in unique_labels]).to_csv("confusion_matrix.csv")

        print(metrics.classification_report(classes, preds))

        # writing misattributions
        pandas.DataFrame(
            [i for i in zip(list(train.index), list(classes), list(preds)) if i[1] != i[2]],
            columns=["id", "True", "Pred"]
        ).set_index('id').to_csv("misattributions.csv")

        # and now making the model for final preds after leave one out if necessary
        if final_pred:
            print(".......... Training final SVM with all train set ........")
            pipe.fit(train, classes)
            preds = pipe.predict(test)
            pandas.DataFrame(data={'filename': preds_index, 'author': list(preds)}).to_csv("FINAL_PREDICTIONS.csv")

    # And now the simple case where there is only one svm to train
    else:
        pipe.fit(train, classes)
        preds = pipe.predict(test)

        # and evaluate
        unique_labels = list(set(classes + classes_test))
        pandas.DataFrame(metrics.confusion_matrix(classes_test, preds, labels=unique_labels),
                         index=['true:{:}'.format(x) for x in unique_labels],
                         columns=['pred:{:}'.format(x) for x in unique_labels]).to_csv("confusion_matrix.csv")

        print(metrics.classification_report(classes_test, preds))

        # AND NOW, we need to evaluate or create the final predictions
        if final_pred:
            pandas.DataFrame(data={'filename': preds_index, 'author': list(preds)}).to_csv("FINAL_PREDICTIONS.csv")

    return pipe
def setup_indices(self, train_data, test_data):
    splitter = skl.LeaveOneOut()
    splitter.get_n_splits(test_data)
    self.indices = list(splitter.split(test_data))
X_test = sc.transform(X_test)  # scale X_test

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(C=0.00018263636363636363, penalty='none', solver='sag')  # cv = leave one out
# classifier = LogisticRegression(C=1e-06, penalty='none', solver='sag')  # cv = 10
# classifier = LogisticRegression(C=8.172727272727273e-05, penalty='l2', solver='sag')  # cv = 5
classifier.fit(X, y)
y_pred = classifier.predict(X_test)

# Model evaluation
leave_one_out = model_selection.LeaveOneOut()
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

accuracies = cross_val_score(estimator=classifier, X=X, y=y, cv=leave_one_out)
print("Training Accuracy = {:.2f} %".format(accuracies.mean() * 100))

cm = confusion_matrix(y_test, y_pred)  # (real values, predicted values)
print("Testing accuracy = {:.2f} %".format(accuracy_score(y_test, y_pred) * 100))
print("TN =", cm[0][0], "TP =", cm[1][1])
print("FP =", cm[0][1], "FN =", cm[1][0])

joblib.dump(classifier, 'model_logistic_regresssion.pkl')
def func(args):
    # input parameters
    atlasobj = args[0]
    ChanggungPatientNets = args[1]

    # return results
    ret = [None, None, None, None, None]  # discover rate, accuracy, precision, recall, specificity

    ChanggungHealthyNets = io_utils.loadRandomDynamicNets(
        ChanggungAllFullPath, atlasobj, totalNum=len(ChanggungPatientNets),
        scanList=os.path.join(ChanggungRootPath, 'normal_scans.txt'))
    sig_connections = stats_utils.filter_sigdiff_connections(ChanggungPatientNets, ChanggungHealthyNets)
    ret[0] = float(len(sig_connections)) / (atlasobj.count * (atlasobj.count - 1) / 2.0)

    X1 = np.zeros((len(ChanggungHealthyNets), 1))  # healthy
    y1 = -1 * np.ones((len(ChanggungHealthyNets), 1))
    X2 = np.zeros((len(ChanggungPatientNets), 1))  # patient
    y2 = np.ones((len(ChanggungPatientNets), 1))
    for c in sig_connections:
        normalCList = result_utils.getAllFCAtIdx(c[0], c[1], ChanggungHealthyNets)
        X1 = np.insert(X1, 0, normalCList, axis=1)
        patientCList = result_utils.getAllFCAtIdx(c[0], c[1], ChanggungPatientNets)
        X2 = np.insert(X2, 0, patientCList, axis=1)
    X = np.concatenate([X1[:, :-1], X2[:, :-1]])
    y = np.concatenate((y1, y2)).ravel()

    # classifier
    classifier = svm.SVC(kernel='linear')

    # leave one out cross validation
    accuracy = []
    truePositive = 0
    falsePositive = 0
    trueNegative = 0
    falseNegative = 0
    loo = model_selection.LeaveOneOut()
    for trainIdx, testIdx in loo.split(X, y):
        classifier.fit(X[trainIdx, :], y[trainIdx])
        accuracy.append(classifier.score(X[testIdx, :], y[testIdx]))
        p = classifier.predict(X[testIdx, :])[0]
        if p == 1 and y[testIdx] == 1:
            truePositive += 1
        elif p == -1 and y[testIdx] == -1:
            trueNegative += 1
        elif p == 1 and y[testIdx] == -1:
            falsePositive += 1
        else:
            falseNegative += 1
    ret[1] = np.mean(accuracy)
    precision = float(truePositive) / (truePositive + falsePositive)
    recall = float(truePositive) / (truePositive + falseNegative)
    specificity = float(trueNegative) / (trueNegative + falsePositive)
    ret[2] = precision
    ret[3] = recall
    ret[4] = specificity
    return ret
def SFBS(self):
    """ Set the regression scheme """
    if self.objFunction == 'MLR':
        self.ObjFunctionRun = MultipleRegression
    elif self.objFunction == 'PCAR':
        self.ObjFunctionRun = PrincipalComponentsRegression
    elif self.objFunction == 'ZSCR':
        self.ObjFunctionRun = ZScoreRegression
    elif self.objFunction == 'ANN':
        self.ObjFunctionRun = NeuralNetwork

    """ Set the Cross validation type """
    if self.crossVal == 'Leave One Out':
        self.cv = model_selection.LeaveOneOut()
    elif self.crossVal == 'K-Fold (5 folds)':
        self.cv = model_selection.KFold(n_splits=5)
    else:
        self.cv = model_selection.KFold(n_splits=10)

    """ Get the predictand Data """
    self.predictandData = pd.DataFrame().from_dict(
        self.equationDict['Predictand']['Data'], orient='columns')
    self.predictandData.columns = ['Predictand']

    """ Initialize data for predictors """
    self.predictorData = pd.DataFrame()
    for predictorName in self.predictorDict:
        for interval in self.predictorDict[predictorName]:
            if self.predictorDict[predictorName][interval]['prdID'] in list(self.equationDict['PredictorPool']):
                self.predictorData = pd.concat([
                    self.predictorData,
                    pd.DataFrame().from_dict(
                        self.predictorDict[predictorName][interval]['Data'], orient='columns')
                ], axis=1)
    self.predictorDataNames = list(self.predictorData.columns)

    """ Initialize a list of dictionaries to store model information """
    self.searchDictList = [{
        "fcstID": "",
        "Type": "Linear - {0}".format(self.objFunction),
        "Coef": [],
        "prdIDs": self.predictorDataNames,
        "Intercept": [],
        "PrincCompData": {},
        "Metrics": {
            "Cross Validated Adjusted R2": -1e4,
            "Root Mean Squared Prediction Error": 1e5,
            "Cross Validated Nash-Sutcliffe": -1e4,
            "Adjusted R2": -1e4,
            "Root Mean Squared Error": 1e5,
            "Nash-Sutcliffe": -1e4,
            "Sample Variance": 1e5
        },
        "CrossValidation": self.crossVal,
        "Forecasted": "",
        "CV_Forecasted": "",
        "Years Used": [],
        "FeatSelectionProgress": "Running"
    } for n in range(self.numModels)]

    """ Begin a loop to iterate through parallelized floating selection """
    iterCounter = 0
    modelsAnalyzed = 0
    modelsCompleted = 0

    """ Array to store current models """
    currentModels = [self.predictorDataNames for i in range(self.numModels)]

    """ Set up a multiprocessing pool """
    pool = ThreadPool(processes=CPUCount() - 1)

    while iterCounter < 1000:
        iterCounter = iterCounter + 1
        print('iteration: ', iterCounter)
        input("continue with next iteration...")

        """ Iterate through each model and perform 1 iteration of Sequential Floating Selection """
        for i in range(self.numModels):
            input()

            """ Check to see if this model has completed yet """
            if self.searchDictList[i]['FeatSelectionProgress'] == 'Completed':
                continue

            """ Set some variables for this iteration """
            modelChanged = False
            currentPredictorSet = self.searchDictList[i]['prdIDs']
            predictorsToBeRemoved = currentPredictorSet

            print("""
Model Number: {0}
current predictor set: {1}
predictors to try and remove: {2}
""".format(i, currentPredictorSet, predictorsToBeRemoved))

            results = list(map(testPredictorSet, [list(l) for l in zip(
                repeat(currentPredictorSet),
                predictorsToBeRemoved,
                repeat('Remove'),
                repeat(currentModels),
                repeat(self.cv),
                repeat(self.perfMetric),
                repeat(self.predictorData),
                repeat(self.predictandData),
                repeat(self.ObjFunctionRun),
                repeat(pool))]))

            """ Determine if any of the removals increased model performance """
            for result in results:
                print("")
                input()
                print("""
We tried removing predictor: {0}
The new metrics are: {1}
the new predictor set is: {2}
""".format(list(set(currentPredictorSet) - set(result[0]['prdID'])), result[1], result[0]['prdID']))

                if result[0]['prdID'] == ['000'] or result[0]['prdID'] == ['-1000']:
                    continue

                if Metrics.metricBetterThan(
                        newMetric=result[1][self.perfMetric],
                        oldMetric=self.searchDictList[i]['Metrics'][self.perfMetric],
                        perfMeasure=self.perfMetric):
                    predictorRemoved = list(set(currentPredictorSet) - set(result[0]['prdID']))
                    self.searchDictList[i]['Metrics'] = result[1]
                    self.searchDictList[i]['prdIDs'] = result[0]['prdID']
                    self.searchDictList[i]['Forecasted'] = result[2]['Forecasted']
                    self.searchDictList[i]['CV_Forecasted'] = result[2]['CV_Forecasted']
                    self.searchDictList[i]['Coef'] = result[3]
                    self.searchDictList[i]['Intercept'] = result[4]
                    self.searchDictList[i]['PrincCompData'] = result[5]
                    currentModels[i] = result[0]['prdID']
                    modelChanged = True

                modelsAnalyzed = modelsAnalyzed + 1
                self.signals.updateRunLabel.emit("Models Analyzed: {0}".format(modelsAnalyzed))

            """ If we didn't remove a predictor, attempt to skip a step and try removing 2 predictors """
            # if modelChanged == False:
            #     predictorsToBeRemoved = list(combinations(currentPredictorSet, 2))
            #     results = list(map(testPredictorSet, [list(l) for l in zip(repeat(currentPredictorSet),
            #                                                                predictorsToBeRemoved,
            #                                                                repeat('Remove'),
            #                                                                repeat(currentModels),
            #                                                                repeat(self.cv),
            #                                                                repeat(self.perfMetric),
            #                                                                repeat(self.predictorData),
            #                                                                repeat(self.predictandData),
            #                                                                repeat(self.ObjFunctionRun),
            #                                                                repeat(pool))]))
            #     for result in results:
            #         if result[0]['prdID'] == '000':
            #             continue
            #         if Metrics.metricBetterThan(newMetric=result[1][self.perfMetric], oldMetric=self.searchDictList[i]['Metrics'][self.perfMetric], perfMeasure=self.perfMetric):
            #             predictorRemoved = list(set(currentPredictorSet) - set(result[0]['prdID']))
            #             self.searchDictList[i]['Metrics'] = result[1]
            #             self.searchDictList[i]['prdIDs'] = result[0]['prdID']
            #             self.searchDictList[i]['Forecasted'] = result[2]['Forecasted']
            #             self.searchDictList[i]['CV_Forecasted'] = result[2]['CV_Forecasted']
            #             self.searchDictList[i]['Coef'] = result[3]
            #             self.searchDictList[i]['Intercept'] = result[4]
            #             self.searchDictList[i]['PrincCompData'] = result[5]
            #             currentModels[i] = result[0]['prdID']
            #             modelChanged = True
            #         modelsAnalyzed = modelsAnalyzed + 1
            #         self.signals.updateRunLabel.emit("Models Analyzed: {0}".format(modelsAnalyzed))

            """ Try and add a variable back in, but don't add in a predictor we just removed """
            currentPredictorSet = self.searchDictList[i]['prdIDs']
            if modelChanged == True:
                predictorsToBeAdded = list(
                    set([prd for prd in self.predictorDataNames if prd not in currentPredictorSet])
                    - set(predictorRemoved))
            else:
                predictorsToBeAdded = [prd for prd in self.predictorDataNames if prd not in currentPredictorSet]

            results = list(map(testPredictorSet, [list(l) for l in zip(
                repeat(currentPredictorSet),
                predictorsToBeAdded,
                repeat('Add'),
                repeat(currentModels),
                repeat(self.cv),
                repeat(self.perfMetric),
                repeat(self.predictorData),
                repeat(self.predictandData),
                repeat(self.ObjFunctionRun),
                repeat(pool))]))

            """ Determine if any of the additions increased model performance """
            for result in results:
                if result[0]['prdID'] == ['000']:
                    continue
                if Metrics.metricBetterThan(
                        newMetric=result[1][self.perfMetric],
                        oldMetric=self.searchDictList[i]['Metrics'][self.perfMetric],
                        perfMeasure=self.perfMetric):
                    predictorRemoved = list(set(currentPredictorSet) - set(result[0]['prdID']))
                    self.searchDictList[i]['Metrics'] = result[1]
                    self.searchDictList[i]['prdIDs'] = result[0]['prdID']
                    self.searchDictList[i]['Forecasted'] = result[2]['Forecasted']
                    self.searchDictList[i]['CV_Forecasted'] = result[2]['CV_Forecasted']
                    self.searchDictList[i]['Coef'] = result[3]
                    self.searchDictList[i]['Intercept'] = result[4]
                    self.searchDictList[i]['PrincCompData'] = result[5]
                    currentModels[i] = result[0]['prdID']
                    modelChanged = True

                modelsAnalyzed = modelsAnalyzed + 1
                self.signals.updateRunLabel.emit("Models Analyzed: {0}".format(modelsAnalyzed))

            """ If the model hasn't changed, complete the model and update the progress bar """
            if modelChanged == False and currentPredictorSet != []:
                self.searchDictList[i]['FeatSelectionProgress'] = 'Completed'
                modelsCompleted = modelsCompleted + 1
                self.signals.updateProgBar.emit(int(100 * modelsCompleted / self.numModels))

    for i in range(len(self.searchDictList)):
        if self.searchDictList[i]['prdIDs'] == []:
            fcstID = 'EMPTY'
        else:
            fcstID = encryptions.generateFcstID(
                self.searchDictList[i]['Type'], self.searchDictList[i]['prdIDs'])
        self.searchDictList[i]['fcstID'] = fcstID

    pool.close()
    pool.join()

    self.signals.returnFcstDict.emit(self.searchDictList)
# exercise 7.1.2
from matplotlib.pyplot import figure, plot, xlabel, ylabel, show
import numpy as np
from scipy.io import loadmat
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection

# requires data from exercise 1.5.1
from ex1_5_1 import *

# Maximum number of neighbors
L = 40

CV = model_selection.LeaveOneOut()
errors = np.zeros((N, L))
i = 0
for train_index, test_index in CV.split(X, y):
    print('Crossvalidation fold: {0}/{1}'.format(i + 1, N))

    # extract training and test set for current CV fold
    X_train = X[train_index, :]
    y_train = y[train_index]
    X_test = X[test_index, :]
    y_test = y[test_index]

    # Fit classifier and classify the test points (consider 1 to 40 neighbors)
    for l in range(1, L + 1):
        knclassifier = KNeighborsClassifier(n_neighbors=l)
        knclassifier.fit(X_train, y_train)
        y_est = knclassifier.predict(X_test)
        errors[i, l - 1] = np.sum(y_est[0] != y_test[0])  # record the misclassification for this fold

    i += 1
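# Plotting sketch (my assumption, suggested by the otherwise unused matplotlib imports in the
# exercise script above): once the folds finish, show the leave-one-out error rate as a
# function of the number of neighbors, reusing errors, N, and L from the loop above.
figure()
plot(range(1, L + 1), 100.0 * errors.sum(axis=0) / N)
xlabel('Number of neighbors')
ylabel('Classification error rate (%)')
show()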
def loo_regression(standardized_X, Y):
    'Leave one out regression and provide the mean absolute error for train test'
    # convert to numpy arrays
    Y = np.array(Y)
    standardized_X = np.array(standardized_X)

    loocv = model_selection.LeaveOneOut()

    reg1 = LinearRegression()
    reg2 = Ridge()
    print(reg2)
    reg3 = Lasso()
    print(reg3)
    reg4 = ElasticNet()
    print(reg4)
    reg5 = xgboost.XGBRegressor(max_depth=3, reg_lambda=1, reg_alpha=1)
    print(reg5)
    reg7 = RandomForestRegressor(n_estimators=10, max_depth=3)  # , max_features=10
    print(reg7)
    # reg8 = SVR(kernel='poly')

    regs = [reg1, reg2, reg3, reg4, reg5, reg7]
    classifier_name = ['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet', 'XGB', 'RF']

    table = []
    for i in range(0, len(classifier_name)):
        print('\n')
        print('Classifier: ', classifier_name[i])
        reg = regs[i]
        loop = 0
        temp = []
        temp_summary = []
        temp_train_err = []
        temp_test_err = []
        for train_index, test_index in loocv.split(standardized_X):
            loop = loop + 1
            # print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = standardized_X[train_index], standardized_X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]
            # train_accuracy.append(reg.score(X_train, y_train))
            reg.fit(X_train, y_train)
            y_train_predict = reg.predict(X_train)
            train_error = mean_absolute_error(y_train, y_train_predict)
            y_test_predict = reg.predict(X_test)
            test_error = mean_absolute_error(y_test, y_test_predict)
            temp.append([y_test[0], round(y_test_predict[0], 2)])
            temp_train_err.append(train_error)
            temp_test_err.append(test_error)

        train_mean = round(np.mean(temp_train_err), 2)
        train_std = round(np.std(temp_train_err), 2)
        test_mean = round(np.mean(temp_test_err), 2)
        test_std = round(np.std(temp_test_err), 2)
        temp_summary.extend([classifier_name[i], train_mean, train_std, test_mean, test_std])
        table.append(temp_summary)

        temp = pd.DataFrame(temp)
        filename3 = 'output/moreno_corrupt/Val_Result_' + classifier_name[i] + '.csv'
        temp.to_csv(filename3, header=False, index=False)

    table = pd.DataFrame(table)
    table.to_csv('output/moreno_corrupt/result_summary.csv', header=False, index=False)
    print(table)
def correlation_supervisor(path, rootdir, simple=False, lig_only=False, max_descriptors=False):
    # load the files from the given input file
    file_dict, fail_dict = accquire_file(path)
    # loop over successful imports to get descriptors:
    big_mat = list()
    col_names = list()
    for i, keyv in enumerate(file_dict.keys()):
        file_dict[keyv].get_descriptor_vector(lig_only, simple, name=False, loud=False)
        # print('i = ', str(i))
        if i == 0:
            col_names = file_dict[keyv].descriptor_names
        # reorganize the data
        this_row = list()
        this_row.append(float(file_dict[keyv].yvalue))
        this_row.extend(file_dict[keyv].descriptors)
        big_mat.append(this_row)
    big_mat = np.array(big_mat)

    ##### let's do some regression
    ### standardize model:
    col_array = np.array(col_names)
    print('length of col array is ' + str(len(col_array)))
    n_tot = len(col_array)
    X = big_mat[:, 1:]
    print('dimension of data matrix is ' + str(big_mat.shape))
    n_obs = len(X[:, 1])
    Scaler = preprocessing.StandardScaler().fit(X)
    Xs = Scaler.transform(X)
    Y = big_mat[:, 0]

    ## find baseline model (all descriptors)
    Reg = linear_model.LinearRegression()
    Reg.fit(Xs, Y)
    Ypred_all_all = Reg.predict(Xs)
    rs_all_all = metrics.r2_score(Y, Ypred_all_all)
    loo = model_selection.LeaveOneOut()
    r_reduce = list()
    mse_reduce = list()

    ### stepwise reduce the feature set until only one is left
    for n in range(0, n_tot):
        # n_features_to_select is keyword-only in recent scikit-learn
        reductor = feature_selection.RFE(Reg, n_features_to_select=n_tot - n, step=1, verbose=0)
        reductor.fit(Xs, Y)
        Ypred_all = reductor.predict(Xs)
        rs_all = metrics.r2_score(Y, Ypred_all)
        mse_all = metrics.mean_squared_error(Y, Ypred_all)
        r_reduce.append(rs_all)
        mse_reduce.append(mse_all)

    ### reduce to one feature
    reductor_features = list()
    for i, ranks in enumerate(reductor.ranking_):
        reductor_features.append([col_array[i], ranks])
    reductor_features = sorted(reductor_features, key=lambda x: x[1])
    # print(reductor_features)
    print('****************************************')

    ### select best number using cv
    selector = feature_selection.RFECV(Reg, step=1, cv=loo, verbose=0,
                                       scoring='neg_mean_squared_error')
    selector.fit(Xs, Y)
    select_mse = selector.cv_results_['mean_test_score']  # grid_scores_ was removed from RFECV
    Ypred = selector.predict(Xs)
    rs = metrics.r2_score(Y, Ypred)
    n_opt = selector.n_features_
    opt_features = col_array[selector.support_]
    ranked_features = list()
    for i, ranks in enumerate(selector.ranking_):
        ranked_features.append([col_array[i], ranks])
    ranked_features = sorted(ranked_features, key=lambda x: x[1])
    print(ranked_features)

    if max_descriptors:
        ## check if we need to reduce further
        print('a max of ' + str(max_descriptors) + ' were requested')
        n_max = int(max_descriptors)
        if n_opt > n_max:
            print('the RFE process selected ' + str(n_opt) + ' variables as optimal')
            print('discarding an additional ' + str(n_max - n_opt) + ' variables')
            new_variables = list()
            new_mask = np.zeros(n_tot)
            for i in range(0, n_max):
                new_variables.append(ranked_features[i])

    ## report results to user
    print('analyzed ' + str(n_obs) + ' molecules')
    print('the full-space R2 is ' + str("%0.2f" % rs_all_all) + ' with ' + str(n_tot) + ' features')
    print('optimal number of features is ' + str(n_opt) + ' of total ' + str(n_tot))
    print('the opt R2 is ' + str("%0.2f" % rs))
    # print(ranked_features)

    X_r = selector.transform(Xs)
    reg_red = linear_model.LinearRegression()
    reg_red.fit(X_r, Y)
    Ypred_r = reg_red.predict(X_r)
    errors = [Y[i] - Ypred_r[i] for i in range(0, n_obs)]
    coefs = reg_red.coef_
    intercept = reg_red.intercept_
    mse_all = metrics.mean_squared_error(Y, Ypred_all_all)
    mse_r = metrics.mean_squared_error(Y, Ypred_r)

    if n_opt < 30:
        print('the optimal variables are: ' + str(opt_features))
        print('the coefficients are ' + str(coefs))
    else:
        print('the (first 30) optimal variables are: ' + str(opt_features[0:29]))
        print('the (first 30) coefficients are ' + str(coefs[0:29]))
    print('the intercept is ' + str("%0.2f" % intercept))
    print('the training MSE with the best feature set is ' + str("%0.2f" % mse_r))
    print('the MSE with all features is ' + str("%0.2f" % mse_all))
    print('by eliminating ' + str(n_tot - n_opt) + ' features,' +
          ' CV-prediction MSE decreased from ' + str("%0.0f" % abs(select_mse[0])) +
          ' to ' + str("%0.0f" % abs(select_mse[n_tot - n_opt])))

    with open(rootdir + 'RFECV_rankings.csv', 'w') as f:
        f.write('RFE_rank,RFE_col,RFECV_rank,RFECV_col, \n')
        for i, items in enumerate(reductor_features):
            f.write(str(items[0]) + ',' + str(items[1]) + ',' +
                    str(ranked_features[i][0]) + ',' + str(ranked_features[i][1]) + '\n')
    with open(rootdir + 'y_data.csv', 'w') as f:
        for items in Y:
            f.write(str(items) + '\n')
    with open(rootdir + 'y_pred_r.csv', 'w') as f:
        for items in Ypred_r:
            f.write(str(items) + '\n')
    with open(rootdir + 'optimal_decriptor_space.csv', 'w') as f:
        for i in range(0, n_obs):
            for j in range(0, n_opt):
                if j == (n_opt - 1):
                    f.write(str(X_r[i][j]) + '\n')
                else:
                    f.write(str(X_r[i][j]) + ',')
    with open(rootdir + 'full_descriptor_space.csv', 'w') as f:
        for names in col_names:
            f.write(names + ',')
        f.write('\n')
        for i in range(0, n_obs):
            for j in range(0, n_tot):
                if j == (n_tot - 1):
                    f.write(str(Xs[i][j]) + '\n')
                else:
                    f.write(str(Xs[i][j]) + ',')
    with open(rootdir + 'scaling.csv', 'w') as f:
        means = Scaler.mean_
        var = Scaler.var_
        f.write('name, mean,variance \n')
        for i in range(0, n_tot):
            f.write(str(col_names[i]) + ',' + str(means[i]) + ',' + str(var[i]) + ',' +
                    str(selector.ranking_[i]) + '\n')
    with open(rootdir + 'coeficients.csv', 'w') as f:
        f.write('intercept,' + str(intercept) + '\n')
        for i in range(0, n_opt):
            f.write(str(opt_features[i]) + ',' + str(coefs[i]) + '\n')
    with open(rootdir + 'rfe_mse.csv', 'w') as f:
        f.write('features removed,mean CV error,' + str(intercept) + '\n')
        count = 0
        for items in mse_reduce:
            f.write(str(count) + ',' + str(items) + '\n')
            count += 1