def doExp(datasetPath, targetEpsilon, numOfRounds):
    """Run ``singleExp`` on ``numOfRounds`` random hold-one-out subsets.

    The dataset is read with ``np.load`` when the file name ends with
    ``npy`` and with comma-delimited ``np.loadtxt`` otherwise.  Each round
    holds out a single row, drops column 0 of the remaining rows and passes
    the rest to ``singleExp`` together with ``targetEpsilon``.

    :param datasetPath: path of the ``.npy`` or CSV dataset.
    :param targetEpsilon: forwarded unchanged to ``singleExp``.
    :param numOfRounds: number of random splits to evaluate.
    :return: list holding the results of every round, in round order.
    """
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        data = np.loadtxt(datasetPath, delimiter=",")

    # test_size=1 is an absolute count: exactly one row is held out per round.
    rs = ShuffleSplit(n_splits=numOfRounds, test_size=1, random_state=0)

    # print() with one pre-formatted argument works on Python 2 and 3 alike.
    print("Samples: %d, Features: %d" % (data.shape[0], data.shape[1] - 1))

    cprResult = []
    for m, (train_index, _) in enumerate(rs.split(data)):
        print("Trail %d" % m)
        trainingData = data[train_index, 1:]
        cprResult.extend(singleExp(trainingData, targetEpsilon))
    return cprResult
def get_features_importance(X, y, variables):
    """
    Estimate feature importance by column permutation on a decision tree.

    For each of 5 random 80/20 splits a ``DecisionTreeClassifier`` is fitted
    and its f1_score on the test part is used as the baseline.  Every column
    of the test part is then shuffled in turn and the relative loss of
    f1_score is stored under the matching variable name.  A ranking of the
    mean losses is printed.

    :param X: train array
    :param y: target array
    :param variables: list of variable names, in the column order of X.
    :return: dictionary with the list of relative f1_score losses for each
        variable (one entry per split).
    """
    splitter = ShuffleSplit(n_splits=5, test_size=.2)
    splitter.get_n_splits(X)
    scores = defaultdict(list)

    for train_idx, test_idx in splitter.split(X):
        held_out_X = X[test_idx]
        held_out_y = y[test_idx]

        tree = DecisionTreeClassifier()
        tree.fit(X[train_idx], y[train_idx])
        baseline = f1_score(held_out_y, tree.predict(held_out_X))

        for col in range(X.shape[1]):
            permuted = held_out_X.copy()
            # Shuffles the column in place through the view.
            np.random.shuffle(permuted[:, col])
            degraded = f1_score(held_out_y, tree.predict(permuted))
            scores[variables[col]].append((baseline - degraded) / baseline)

    ranking = [(np.round(np.mean(losses), 4), feat)
               for feat, losses in scores.items()]
    print("Features sorted by their score:")
    print(sorted(ranking, reverse=True))
    return scores
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a
        decision tree regressor trained on the input data [X, y].

        The search scores candidates with ``performance_metric`` on 10
        random 80/20 splits and returns the best fitted estimator. """

    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)

    regressor = DecisionTreeRegressor()

    # Every depth from 1 to 10 inclusive (the previous hand-written list
    # skipped 3).
    params = {'max_depth': list(range(1, 11))}

    # Transform 'performance_metric' into a scoring function
    scoring_fnc = make_scorer(performance_metric)

    # 'scoring' is passed by keyword: current scikit-learn releases accept
    # only estimator and param_grid positionally.
    grid = GridSearchCV(estimator=regressor, param_grid=params,
                        scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_
def split_data_train_val_test(px_fol):
    """Randomly partition the directory entries of ``px_fol``.

    20% of the entries become the test set; of the remaining entries a
    further 20% become the validation set and the rest the training set.
    The splits are not seeded, so every call gives a different partition.

    :param px_fol: directory whose entries (``os.listdir``) are partitioned.
    :return: dict with keys 'train', 'val' and 'test', each a list of names.
    """
    patients = np.asarray(os.listdir(px_fol))

    # First cut: separate the test entries from everything else.
    outer = ShuffleSplit(n_splits=1, test_size=0.20)
    outer.get_n_splits(patients)
    rest_idx, test_idx = next(outer.split(patients))
    remaining = patients[rest_idx]
    x_test = patients[test_idx]

    # Second cut: carve the validation entries out of the remainder.
    inner = ShuffleSplit(n_splits=1, test_size=0.20)
    inner.get_n_splits(remaining)
    train_idx, val_idx = next(inner.split(remaining))

    return {
        'train': remaining[train_idx].tolist(),
        'val': remaining[val_idx].tolist(),
        'test': x_test.tolist(),
    }


#%%
def train_model(label):
    """Train ten binary logistic-regression models for ``label``.

    The training data comes from ``load_training_data()``.  Ten random
    splits are drawn; for each one a classifier is fitted on the training
    part, with the target set to 1 where the sample's class equals
    ``labelSpace_dict[label]`` and 0 elsewhere.

    :param label: key of ``labelSpace_dict`` naming the positive class.
    :return: list of the ten fitted ``LogisticRegression`` models.
    """
    label_index = labelSpace_dict[label]
    X, y = load_training_data()

    # Ten random splits, each using 90% of the samples for training; the
    # predicted attribute is binary.
    rs = ShuffleSplit(n_splits=10, test_size=.1, random_state=0)
    rs.get_n_splits(X)

    folds = []
    for train_index, _ in rs.split(X):
        features = [X[i] for i in train_index]
        targets = [1 if y[i] == label_index else 0 for i in train_index]
        folds.append((features, targets))

    # Train the ten binary classification models, one per split.
    logre_classifier = []
    for features, targets in folds:
        classifier = LogisticRegression()
        classifier.fit(features, targets)
        logre_classifier.append(classifier)
    return logre_classifier
def my_train_test_split(data_size, test_size=0.30):
    """Return random train/test index arrays for ``data_size`` samples.

    Placeholder features and labels of the right length are generated only
    so that ``ShuffleSplit`` has something to split.

    :param data_size: number of samples to produce indices for.
    :param test_size: share (or count) of samples for the test part.
    :return: tuple ``(train_index, test_index)``.
    """
    splitter = ShuffleSplit(n_splits=1, test_size=test_size)

    # Dummy data: random (data_size, 2) features and 0/1 labels.
    features = np.random.rand(data_size * 2).reshape((data_size, 2))
    labels = np.random.randint(2, size=data_size)

    splitter.get_n_splits(features, labels)
    indices = next(splitter.split(features, labels))
    return indices[0], indices[1]
def make_donuts(n=4000, noise=0.2, factor=0.5, test_size=0.92, nneigh=5,
                mesh=False, mesh_step=0.02):
    """Build a two-circles dataset with a neighbour graph and a random split.

    :param n: number of samples passed to ``datasets.make_circles``.
    :param noise: noise level passed to ``make_circles`` (previously ignored
        in favour of a hard-coded 0.2).
    :param factor: inner/outer circle scale passed to ``make_circles``
        (previously ignored in favour of a hard-coded 0.5).
    :param test_size: share of samples placed in the test indices.
    :param nneigh: forwarded to ``make_graph``.
    :param mesh: when true, also build a grid covering the data and its graph.
    :param mesh_step: grid spacing used when ``mesh`` is true.
    :return: ``(adj, X, y, train_index, test_index, test_index, mesh_pack)``;
        the test indices appear twice and ``mesh_pack`` is
        ``(mesh_adj, mesh_X, xx, yy)``, all ``None`` when ``mesh`` is false.
    """
    X, y = datasets.make_circles(n_samples=n, noise=noise, factor=factor)
    # The graph is built on the raw coordinates, before standardisation.
    adj = make_graph(X, nneigh)
    X = StandardScaler().fit_transform(X)

    sss = ShuffleSplit(n_splits=1, test_size=test_size)
    train_index, test_index = next(sss.split(X, y))

    mesh_X = None
    mesh_adj = None
    xx = None
    yy = None
    if mesh:
        # Grid over the bounding box of the data, padded by 0.5 on each side.
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step),
                             np.arange(y_min, y_max, mesh_step))
        mesh_X = np.c_[xx.ravel(), yy.ravel()]
        mesh_adj = make_graph(mesh_X, nneigh)  # Might take a long time

    # Always defined, so the default mesh=False call can return it.
    mesh_pack = (mesh_adj, mesh_X, xx, yy)
    return adj, X, y, train_index, test_index, test_index, mesh_pack
def generate_k_fold_cross_valid_idx(max_idx):
    """
    Generate train/validation/test indices for each of 10 folds.

    The test indices come from an unshuffled 10-fold ``KFold``; within each
    fold a random 15% of the training indices is set aside for validation.

    :param max_idx: how many data you have for each class, colorectal have 625
    :return: three lists (trains, valids, tests) with one sorted index array
        per fold.
    """
    trains, valids, tests = [], [], []

    sample_ids = np.arange(max_idx)
    kf = KFold(n_splits=10)
    kf.get_n_splits(sample_ids)

    for train_index, test_index in kf.split(sample_ids):
        # Split positions within train_index, then map back to sample ids.
        positions = np.arange(len(train_index))
        kf_val = ShuffleSplit(n_splits=1, test_size=0.15)
        kf_val.get_n_splits(positions)
        train_pos, val_pos = next(kf_val.split(positions))

        trains.append(np.sort(train_index[train_pos]))
        valids.append(np.sort(train_index[val_pos]))
        tests.append(test_index)

    return trains, valids, tests
def shuffle(path):
    """
    Shuffle the array stored at ``path`` and write a 75/25 train/test split.

    The file is read with ``np.loadtxt``; the last column is taken as the
    integer label and the other columns as features.  Four files named
    ``path + '_X_train'``, ``'_Y_train'``, ``'_X_test'`` and ``'_Y_test'``
    are written with the ``'%d'`` format, and each name is printed.

    :param path: Where U put data in the dir
    :return: None
    """
    data = np.loadtxt(path)
    # The ``np.int`` alias was removed from NumPy; the builtin is equivalent.
    y = data[:, -1].astype(int)
    X = data[:, :-1]

    rs = ShuffleSplit(n_splits=1, test_size=.25, random_state=0)
    train_index, test_index = next(rs.split(X, y))

    print("==============================")
    print("Making dataset")
    outputs = (
        ('_X_train', X[train_index]),
        ('_Y_train', y[train_index]),
        ('_X_test', X[test_index]),
        ('_Y_test', y[test_index]),
    )
    for suffix, values in outputs:
        target = path + suffix
        np.savetxt(target, values, fmt='%d')
        print(target)
    print("==============================")
    print('FINISHED !')
def doExp(datasetPath, epsilon, varianceRatio, numOfRounds, numOfDimensions,
          numOfSamples, isLinearSVM=True):
    """Run ``singleExp`` over a range of dimensions on random 80/20 splits.

    The comma-delimited dataset is loaded, a PCA model over all columns but
    the first gives the number of components reaching ``varianceRatio``, and
    the non-first columns are row-normalised with ``gf.normByRow``.  Each
    split is passed to ``singleExp``; results are concatenated row-wise,
    printed as CSV lines and returned.

    :param datasetPath: path of the comma-delimited dataset.
    :param epsilon: not used by this function.
    :param varianceRatio: target variance passed to the PCA model.
    :param numOfRounds: number of random splits.
    :param numOfDimensions: number of dimension values to sample.
    :param numOfSamples: not used by this function.
    :param isLinearSVM: forwarded to ``singleExp``.
    :return: array with the concatenated results of all rounds.
    """
    data = np.loadtxt(datasetPath, delimiter=",")
    globalPCA = PCAModule.PCAImpl(data[:, 1:])
    numOfFeature = data.shape[1] - 1
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(
        varianceRatio)
    print("%d/%d dimensions captures %.2f variance." % (
        largestReducedFeature, numOfFeature, varianceRatio))

    if numOfDimensions > numOfFeature:
        xDimensions = np.arange(1, numOfFeature)
        topK = numOfFeature
    else:
        # Floor division keeps the step an integer on Python 3 as well.
        step = max(largestReducedFeature // numOfDimensions, 1)
        xDimensions = np.arange(1, largestReducedFeature, step)
        topK = largestReducedFeature

    rs = ShuffleSplit(n_splits=numOfRounds, test_size=.2, random_state=0)

    # Normalise every column except the first, then put column 0 back.
    normalizedData = gf.normByRow(data[:, 1:])
    normalizedData = np.concatenate((data[:, [0]], normalizedData), axis=1)

    cprResult = None
    for train_index, test_index in rs.split(data):
        trainingData = normalizedData[train_index]
        testingData = normalizedData[test_index]
        tmpResult = singleExp(xDimensions, trainingData, testingData, topK,
                              isLinearSVM)
        if cprResult is None:
            cprResult = tmpResult
        else:
            cprResult = np.concatenate((cprResult, tmpResult), axis=0)

    # Results are reported per round; no averaging over rounds is done.
    avgResult = cprResult
    for result in avgResult:
        print(','.join(['%.3f' % num for num in result]))
    return avgResult
def ModelLearning(X, y):
    """ Plots learning curves of decision tree regressors of several depths.

        For max_depth 1, 3, 6 and 10 the r2 training and testing scores are
        computed for nine training-set sizes on 10 random 80/20 splits and
        drawn, with a one-standard-deviation band, in a 2x2 grid. """

    # Ten random 80/20 cross-validation splits
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    cv.get_n_splits(X)

    # Nine training-set sizes spread evenly from 1 up to 80% of the data
    train_sizes = np.rint(np.linspace(1, X.shape[0] * 0.8 - 1, 9)).astype(int)

    fig = pl.figure(figsize=(10, 7))

    # One subplot per tree depth
    for position, depth in enumerate([1, 3, 6, 10], start=1):
        regressor = DecisionTreeRegressor(max_depth=depth)

        sizes, train_scores, test_scores = learning_curve(
            regressor, X, y, cv=cv, train_sizes=train_sizes, scoring='r2')

        # Mean and spread across the splits
        train_std = np.std(train_scores, axis=1)
        train_mean = np.mean(train_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)

        ax = fig.add_subplot(2, 2, position)
        ax.plot(sizes, train_mean, 'o-', color='r', label='Training Score')
        ax.plot(sizes, test_mean, 'o-', color='g', label='Testing Score')

        train_low, train_high = train_mean - train_std, train_mean + train_std
        ax.fill_between(sizes, train_low, train_high, alpha=0.15, color='r')
        test_low, test_high = test_mean - test_std, test_mean + test_std
        ax.fill_between(sizes, test_low, test_high, alpha=0.15, color='g')

        # Labels
        ax.set_title('max_depth = %s' % (depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('Score')
        ax.set_xlim([0, X.shape[0] * 0.8])
        ax.set_ylim([-0.05, 1.05])

    # The legend is attached to the last subplot drawn
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left',
              borderaxespad=0.)
    fig.suptitle('Decision Tree Regressor Learning Performances',
                 fontsize=16, y=1.03)
    fig.tight_layout()
    fig.show()
def main():
    """Train on the first dev fold and report metrics on the test set.

    When ``LOAD_TEST_SPLIT`` is false, a seeded 90/10 split of the global
    ``X``/``y`` creates the train-dev and test sets; otherwise the global
    ``X_train_dev``/``X_test``/``y_train_dev``/``y_test`` are used as they
    are.  Only the first ``KFold`` fold is trained.  Its test predictions are
    scored, logged and, if ``args.output_path`` is set, pickled there.
    """
    # ``global`` applies to the whole function wherever it is written, so
    # declaring both groups here matches the original branch-local form.
    global X, y
    global X_train_dev, X_test, y_train_dev, y_test
    from sklearn.model_selection import ShuffleSplit, KFold

    if not LOAD_TEST_SPLIT:
        ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
        train_index, test_index = next(ss.split(y))
        X_train_dev = [X[i] for i in train_index]
        X_test = [X[i] for i in test_index]
        y_train_dev = [y[i] for i in train_index]
        y_test = [y[i] for i in test_index]

    # random_state is rejected by current scikit-learn unless shuffle=True,
    # and had no effect without it, so it is dropped.
    kf = KFold(n_splits=NUM_FOLD)
    gold_list = None
    pred_list = None
    for fold, (train_index, dev_index) in enumerate(kf.split(y_train_dev)):
        logger('STARTING Fold -----------', fold + 1)
        X_train = [X_train_dev[i] for i in train_index]
        X_dev = [X_train_dev[i] for i in dev_index]
        y_train = [y_train_dev[i] for i in train_index]
        y_dev = [y_train_dev[i] for i in dev_index]

        gold_list, pred_list = train(X_train, y_train, X_dev, y_dev,
                                     X_test, y_test)
        # Only the first fold is used; no voting across folds takes place.
        break

    final_pred = pred_list

    logger('Final test by majority voting:')
    show_classification_report(gold_list, final_pred)
    metric = get_metrics(gold_list, final_pred)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1],
           'micro F', metric[4])
    metric = get_multi_metrics(gold_list, final_pred)
    logger('Multi only: h_loss:', metric[0], 'macro F', metric[1],
           'micro F', metric[4])
    metric = get_single_metrics(gold_list, final_pred)
    logger('Single only: h_loss:', metric[0], 'macro F', metric[1],
           'micro F', metric[4])
    logger('Jaccard:', jaccard_score(gold_list, final_pred))
    logger('Bert Binary', args)

    if args.output_path is not None:
        with open(args.output_path, 'bw') as _f:
            pkl.dump(final_pred, _f)
def ShuffleData_ecg_2(X, y):
    """Print 30 seeded random 75/25 splits and return the last one.

    :param X: indexable feature container.
    :param y: indexable label container aligned with ``X``.
    :return: ``(X_train, X_test, y_train, y_test)`` of the final split.
    """
    splitter = ShuffleSplit(n_splits=30, test_size=0.25, random_state=42)
    splitter.get_n_splits(X)
    for train_index, test_index in splitter.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        # Overwritten on every pass; only the last split survives the loop.
        parts = (X[train_index], X[test_index],
                 y[train_index], y[test_index])
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test
def rf_allMix(X, y, path, title, n_estimators, max_depth):
    """Evaluate a random forest on 10 seeded random 80/20 splits.

    For every split the features are standardised (scaler fitted on the
    training part only), the forest is refitted and accuracy, micro-averaged
    precision/recall/F1 and mean absolute error are computed.  Per-split
    values go to ``performance_every_shuffler_allmix`` and the accumulated
    ones to ``performance_global_shuffle_allmix``.

    :param X: feature array; ``len(X[0])`` is used as ``max_features``.
    :param y: label array aligned with ``X``.
    :param path: forwarded to the two reporting helpers.
    :param title: forwarded to the two reporting helpers.
    :param n_estimators: number of trees in the forest.
    :param max_depth: maximum tree depth.
    """
    fold_accuracy = []
    fold_precision = []
    fold_recall = []
    fold_f1 = []
    fold_mae = []
    all_truth = []
    all_predictions = []

    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    max_features=len(X[0]),
                                    n_jobs=-1,
                                    max_depth=max_depth)

    splitter = ShuffleSplit(n_splits=10, test_size=0.20, random_state=42)
    splitter.get_n_splits(X)
    for train_index, test_index in splitter.split(X):
        y_train, y_test = y[train_index], y[test_index]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_index])
        X_test = scaler.transform(X[test_index])

        # Train on this split, then predict its test part.
        forest.fit(X_train, y_train)
        y_pred = forest.predict(X_test)
        all_predictions.extend(y_pred)
        all_truth.extend(y_test)

        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred, average='micro')
        recall = metrics.recall_score(y_test, y_pred, average='micro')
        f1 = metrics.f1_score(y_test, y_pred, average='micro')
        mae = metrics.mean_absolute_error(y_test, y_pred)

        fold_accuracy.append(accuracy)
        fold_precision.append(precision)
        fold_recall.append(recall)
        fold_f1.append(f1)
        fold_mae.append(mae)

        performance_every_shuffler_allmix(y_pred, y_test, accuracy,
                                          precision, recall, f1, mae,
                                          path, title)

    performance_global_shuffle_allmix(all_predictions, all_truth,
                                      fold_accuracy, fold_precision,
                                      fold_recall, fold_f1, fold_mae,
                                      path, title)
def main():
    """Train one model per dev fold and majority-vote the test predictions.

    The GloVe tokenizer and embedding are built on all available text.  When
    ``LOAD_TEST_SPLIT`` is false, a seeded 90/10 split of the global
    ``X``/``y`` creates the train-dev and test sets; otherwise the global
    pre-split sets are used.  Each of ``args.folds`` ``KFold`` folds trains a
    model.  For every test sample the final prediction is the first element
    returned by ``find_majority`` over the fold predictions.
    """
    # ``global`` applies to the whole function wherever it is written, so
    # declaring both groups here matches the original branch-local form.
    global X, y
    global X_train_dev, X_test, y_train_dev, y_test
    if not LOAD_TEST_SPLIT:
        ALL_TRAINING = X
    else:
        ALL_TRAINING = X_train_dev + X_test

    glove_tokenizer.build_tokenizer(ALL_TRAINING, vocab_size=VOCAB_SIZE)
    glove_tokenizer.build_embedding(GLOVE_EMB_PATH,
                                    dataset_name=data_set_name)

    from sklearn.model_selection import ShuffleSplit, KFold

    if not LOAD_TEST_SPLIT:
        ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
        train_index, test_index = next(ss.split(y))
        X_train_dev = [X[i] for i in train_index]
        X_test = [X[i] for i in test_index]
        y_train_dev = [y[i] for i in train_index]
        y_test = [y[i] for i in test_index]

    # Current scikit-learn rejects random_state unless shuffle=True, and it
    # had no effect without it, so it is dropped and folds stay contiguous.
    # NOTE(review): args.dev_split_seed is therefore unused here; pass
    # shuffle=True with it if seeded shuffling was the intent.
    kf = KFold(n_splits=args.folds)

    all_preds = []
    gold_list = None
    for fold, (train_index, dev_index) in enumerate(kf.split(y_train_dev)):
        logger('STARTING Fold -----------', fold + 1)
        X_train = [X_train_dev[i] for i in train_index]
        X_dev = [X_train_dev[i] for i in dev_index]
        y_train = [y_train_dev[i] for i in train_index]
        y_dev = [y_train_dev[i] for i in dev_index]

        gold_list, pred_list, model = train(X_train, y_train, X_dev, y_dev,
                                            X_test, y_test)
        all_preds.append(pred_list)

    # Rows are folds; column m holds every fold's prediction for sample m.
    all_preds = np.stack(all_preds, axis=0)
    shape = all_preds[0].shape
    mj = np.zeros(shape[0])
    for m in range(shape[0]):
        mj[m] = find_majority(np.asarray(all_preds[:, m]).reshape((-1)))[0]

    final_pred = mj

    print('TEST---------: ')
    show_classification_report(gold_list, final_pred)
    metric = get_metrics(gold_list, final_pred)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1],
           'micro F', metric[4])
    logger(os.path.basename(__file__))
    logger(args)
def multiTrain(data, label, test_data):
    """Train ``xgbmodelc`` on 18 random 80/20 splits and save the results.

    Two outputs are written: the test prediction of the split with the
    highest validation accuracy goes to
    ``./best/best_result_<timestamp>.csv`` through ``save_result``, and a
    per-sample majority vote over all splits is one-hot encoded into
    ``rank_result_1.csv``.

    :param data: indexable training samples.
    :param label: indexable training labels aligned with ``data``.
    :param test_data: samples forwarded to ``xgbmodelc`` for prediction.
    """
    best_val_acc = 0
    best_epoch = None
    result_list = []
    train_acc_list = []
    val_acc_list = []
    final_result = []

    splitstate = ShuffleSplit(n_splits=18, test_size=.20)
    time_str = time.strftime("%Y-%m-%d-%H:%M:%S",
                             time.localtime(time.time()))

    epoch = 0
    for train_index, val_index in splitstate.split(data, label):
        epoch += 1
        print("epoch: ", epoch)
        print("TRAIN:", train_index, "TEST: ", val_index)

        # Index the arguments that were split, not module-level arrays.
        sub_train_data = np.array([data[i] for i in train_index])
        sub_train_label = np.array([label[i] for i in train_index])
        sub_val_data = np.array([data[i] for i in val_index])
        sub_val_label = np.array([label[i] for i in val_index])

        train_acc, val_acc, test_prediction = xgbmodelc(
            sub_train_data, sub_train_label, sub_val_data, sub_val_label,
            test_data)

        if val_acc > best_val_acc:
            print("find a better val_acc: " + str(best_val_acc) + " -> " +
                  str(val_acc))
            best_val_acc = val_acc
            best_epoch = epoch

        result_list.append(test_prediction)
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)

    if best_epoch is None:
        # No split beat the initial accuracy of 0; fall back to the last.
        best_epoch = epoch

    # Report and save the best epoch, not whichever epoch ran last.
    print("best_epoch: {}".format(best_epoch))
    best_result = result_list[best_epoch - 1]
    best_result_save_name = "./best/best_result_" + time_str + ".csv"
    save_result(best_result_save_name, best_result)
    print(best_val_acc)
    print(train_acc_list)

    result_list = np.array(result_list)
    # NOTE(review): 17 is hard-coded; presumably the number of test
    # samples -- confirm, or derive it from result_list.shape[1].
    for i in range(17):
        counts = np.bincount(result_list[:, i])
        final_result.append(np.argmax(counts))
    final_result = np.array(final_result)

    one_hots = to_categorical(final_result).astype(np.int32)
    with open('rank_result_1.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for row in one_hots:
            writer.writerow(row)
    print("预测完毕!")
def tts_split(X, y, size, splits):
    '''Split the data into train and test parts with ``ShuffleSplit``.

    ``splits`` random splits are drawn with test share ``size``; only the
    last one is returned as ``(X_train, X_test, y_train, y_test)``.
    '''
    splitter = ShuffleSplit(n_splits=splits, test_size=size)
    splitter.get_n_splits(X)
    for train_index, test_index in splitter.split(X, y):
        # Overwritten on every pass; the final split is what gets returned.
        parts = (X[train_index], X[test_index],
                 y[train_index], y[test_index])
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test
def plot_learning_performance(regressor, X, y):
    """
    Plot the learning curves of ``regressor`` for training and testing.

    The r2 score is computed for nine training-set sizes on 5 random 80/20
    splits. The shaded region around each curve is one standard deviation
    across the splits.
    """
    from matplotlib.pyplot import figure

    # Five random 80/20 cross-validation splits
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    cv.get_n_splits(X)

    # Nine training-set sizes spread evenly from 1 up to 80% of the data
    train_sizes = np.rint(np.linspace(1, X.shape[0] * 0.8 - 1, 9)).astype(int)

    sizes, train_scores, test_scores = learning_curve(
        regressor, X, y, cv=cv, train_sizes=train_sizes, scoring='r2')

    # Mean and spread across the splits
    train_std = np.std(train_scores, axis=1)
    train_mean = np.mean(train_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)

    figure(num=None, figsize=(8, 5), dpi=80, facecolor='w', edgecolor='k')
    plt.title('')
    plt.xlabel('Number of Training Points')
    plt.ylabel('r2 score')
    plt.xlim([0, X.shape[0] * 0.8])
    plt.ylim([-0.05, 1.05])

    plt.plot(sizes, train_mean, 'o-', color='r', label='Training Score')
    plt.plot(sizes, test_mean, 'o-', color='g', label='Testing Score')

    train_low, train_high = train_mean - train_std, train_mean + train_std
    plt.fill_between(sizes, train_low, train_high, alpha=0.15, color='r')
    test_low, test_high = test_mean - test_std, test_mean + test_std
    plt.fill_between(sizes, test_low, test_high, alpha=0.15, color='g')

    # Visual aesthetics
    plt.legend(bbox_to_anchor=(0.4, 1.3), loc='lower left',
               borderaxespad=0.)
    heading = type(regressor).__name__ + ' Learning Performances'
    plt.suptitle(heading, fontsize=16, y=1.03)
    plt.show()
def doExp(datasetPath, epsilon, varianceRatio, numOfRounds,
          numOfPointsinXAxis, isLinearSVM=True):
    """Run ``singleExp`` over a range of sample counts on random 80/20 splits.

    The dataset is read with ``np.load`` when the file name ends with
    ``npy`` and with comma-delimited ``np.loadtxt`` otherwise.  A PCA model
    over the standardised non-first columns gives the number of components
    reaching ``varianceRatio``.  For each split, sample counts from 2 up to
    half of the training rows are evaluated by ``singleExp``; the results
    are concatenated row-wise, printed as CSV lines and returned.

    :param datasetPath: path of the ``.npy`` or CSV dataset.
    :param epsilon: forwarded to ``singleExp``.
    :param varianceRatio: target variance passed to the PCA model.
    :param numOfRounds: number of random splits.
    :param numOfPointsinXAxis: number of sample counts to evaluate.
    :param isLinearSVM: forwarded to ``singleExp``.
    :return: array with the concatenated results of all rounds.
    """
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        data = np.loadtxt(datasetPath, delimiter=",")

    numOfFeature = data.shape[1] - 1
    # Standardised data is used for the PCA only; the splits below use the
    # data as loaded.
    data_std = StandardScaler().fit_transform(data[:, 1:])
    globalPCA = PCAImpl(data_std)
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(
        varianceRatio)
    print("%d/%d dimensions captures %.2f variance." % (
        largestReducedFeature, numOfFeature, varianceRatio))

    rs = ShuffleSplit(n_splits=numOfRounds, test_size=.2, random_state=0)

    cprResult = None
    for train_index, test_index in rs.split(data):
        trainingData = data[train_index]
        testingData = data[test_index]
        print("number of training samples %d" % trainingData.shape[0])

        # Floor division keeps the counts integral on Python 3 as well.
        mostSamplesPerDataOwner = trainingData.shape[0] // 2
        step = max(mostSamplesPerDataOwner // numOfPointsinXAxis, 1)
        xSamples = np.arange(2, mostSamplesPerDataOwner, step)
        print("number of samples be tested: %s" % xSamples)

        tmpResult = singleExp(xSamples, trainingData, testingData,
                              largestReducedFeature, epsilon, isLinearSVM)
        if cprResult is None:
            cprResult = tmpResult
        else:
            cprResult = np.concatenate((cprResult, tmpResult), axis=0)

    for result in cprResult:
        print(','.join(['%.3f' % num for num in result]))
    return cprResult
def bootstrap_runner(run_name):
    """Train or load one network per random 85/15 split and report the AUC.

    The selected EFPs are read from ``results/<run_name>/selected_efps.csv``
    (the first entry is skipped) and handed to ``grab_and_mix_data``.  Each
    split reuses the model stored at ``<bs_model_dir>/bs-<k>.h5`` if that
    file exists and trains a new one with ``nn`` otherwise.  Validation AUCs
    go to ``results/<run_name>/bootstrap_results.csv``; the mean and standard
    deviation are printed at the end.

    :param run_name: name of the results sub-directory to read and write.
    """
    results_dir = path / "results" / run_name
    selected_efps = pd.read_csv(results_dir / "selected_efps.csv")
    selected_efps = selected_efps.efp.tolist()[1:]
    X, y = grab_and_mix_data(selected_efps)

    rs = ShuffleSplit(n_splits=n_splits, random_state=0, test_size=0.15)

    straps = []
    aucs = []
    for bs_count, (train_index, test_index) in enumerate(rs.split(X)):
        X_val = X[test_index]
        y_val = y[test_index]

        model_file = f"{bs_model_dir}/bs-{bs_count}.h5"
        if not os.path.isfile(model_file):
            model = nn(
                X_train=X[train_index],
                y_train=y[train_index],
                X_val=X_val,
                y_val=y_val,
                epochs=epochs,
                batch_size=batch_size,
                layers=layers,
                nodes=nodes,
                model_file=model_file,
                verbose=0,
            )
        else:
            model = tf.keras.models.load_model(model_file)

        auc_val = roc_auc_score(y_val, np.hstack(model.predict(X_val)))
        straps.append(bs_count)
        aucs.append(auc_val)

        # Rewritten after every split; presumably so that partial results
        # survive an interrupted run.
        results = pd.DataFrame({"bs": straps, "auc": aucs})
        results.to_csv(results_dir / "bootstrap_results.csv")

    auc_mean = np.average(aucs)
    auc_std = np.std(aucs)
    print(f"AUC = {auc_mean:.5f} +/- {auc_std:.5f}")
def ModelComplexity(X, y):
    """ Plots the validation curve of a decision tree regressor.

        The r2 training and validation scores are computed for max_depth
        1 through 10 on 10 random 80/20 splits and drawn with a
        one-standard-deviation band. """

    # Ten random 80/20 cross-validation splits
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    cv.get_n_splits(X)

    # Depths 1..10 inclusive
    max_depth = np.arange(1, 11)

    train_scores, test_scores = validation_curve(
        DecisionTreeRegressor(), X, y,
        param_name="max_depth", param_range=max_depth,
        cv=cv, scoring='r2')

    # Mean and spread across the splits
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    pl.plot(max_depth, train_mean, 'o-', color='r', label='Training Score')
    pl.plot(max_depth, test_mean, 'o-', color='g', label='Validation Score')

    train_low, train_high = train_mean - train_std, train_mean + train_std
    pl.fill_between(max_depth, train_low, train_high, alpha=0.15, color='r')
    test_low, test_high = test_mean - test_std, test_mean + test_std
    pl.fill_between(max_depth, test_low, test_high, alpha=0.15, color='g')

    # Visual aesthetics
    pl.legend(loc='lower right')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Score')
    pl.ylim([-0.05, 1.05])
    pl.show()
def doExp(datasetPath, varianceRatio, numOfRounds):
    """Run ``singleExp`` over epsilons 0.1..1.0 on random training subsets.

    The dataset is read with ``np.load`` when the file name ends with
    ``npy`` and with comma-delimited ``np.loadtxt`` otherwise.  A PCA model
    over all columns but the first gives the number of components reaching
    ``varianceRatio``.  Each round holds out two rows and passes the
    remaining rows (without column 0) to ``singleExp``.

    :param datasetPath: path of the ``.npy`` or CSV dataset.
    :param varianceRatio: target variance passed to the PCA model.
    :param numOfRounds: number of random splits to evaluate.
    :return: float array built from the results of all rounds.
    """
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        data = np.loadtxt(datasetPath, delimiter=",")

    # test_size=2 is an absolute count: two rows are held out per round.
    rs = ShuffleSplit(n_splits=numOfRounds, test_size=2, random_state=0)

    globalPCA = PCAImpl(data[:, 1:])
    numOfFeature = data.shape[1] - 1
    matrixRank = LA.matrix_rank(data[:, 1:])
    # print() with one pre-formatted argument works on Python 2 and 3 alike.
    print("Matrix rank of the data is %d." % matrixRank)

    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(
        varianceRatio)
    print("%d/%d dimensions captures %.2f variance." % (
        largestReducedFeature, numOfFeature, varianceRatio))

    xEpsilons = np.arange(0.1, 1.1, 0.1)

    cprResult = []
    for m, (train_index, _) in enumerate(rs.split(data)):
        print("Trail %d" % m)
        pureTrainingData = data[train_index][:, 1:]
        cprResult.extend(
            singleExp(xEpsilons, pureTrainingData, largestReducedFeature))

    for result in cprResult:
        print(','.join(['%.3f' % num for num in result]))
    return np.asarray(cprResult, dtype=float)
def run_SVM(x, y):
    '''
    run cross validated SVM regression

    Five unseeded random 80/20 splits are drawn and each one is handed to
    ``train_svm``.

    :param x: feature vectors
    :param y: labels
    :return: None
    '''
    # Python 3 print calls; the Python 2 statements no longer parse.
    print('SVM: ')
    rs = ShuffleSplit(n_splits=5, test_size=.20)
    for split, (train_index, test_index) in enumerate(rs.split(x)):
        print("split", split)
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        train_svm(x_train, x_test, y_train, y_test)
def trainTestSplit(x_data, y_data):
    """Return a seeded random 70/30 train/test split.

    :param x_data: indexable feature container.
    :param y_data: indexable label container aligned with ``x_data``.
    :return: ``(X_train, y_train, X_test, y_test)`` -- train parts first.
    """
    splitter = ShuffleSplit(n_splits=1, train_size=0.7, test_size=0.3,
                            random_state=0)
    splitter.get_n_splits(x_data)
    train_index, test_index = next(splitter.split(x_data, y_data))

    X_train = x_data[train_index]
    X_test = x_data[test_index]
    y_train = y_data[train_index]
    y_test = y_data[test_index]
    return X_train, y_train, X_test, y_test
def run_LR(x, y):
    '''
    run cross validated logistic regression

    Five unseeded random 80/20 splits are drawn and each one is handed to
    ``run_logreg``; the values it returns are not used here.

    :param x: feature vectors
    :param y: labels
    :return: None
    '''
    rs = ShuffleSplit(n_splits=5, test_size=.20)
    # Python 3 print calls; the Python 2 statements no longer parse.
    print('Logistic Regression: ')
    for split, (train_index, test_index) in enumerate(rs.split(x)):
        print("split", split)
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        run_logreg(x_train, x_test, y_train, y_test)
    return
def get_acc_auc_randomisedCV(X, Y, iterNo=5, test_percent=0.2):
    """Return mean accuracy and mean AUC over random train/test splits.

    For each of ``iterNo`` splits seeded with ``RANDOM_STATE``,
    ``models_partc.logistic_regression_pred`` is called with the training
    part and the test features; its output is scored against the test
    labels.

    :param X: indexable feature container.
    :param Y: indexable label container aligned with ``X``.
    :param iterNo: number of random splits.
    :param test_percent: share of samples placed in each test part.
    :return: ``(mean accuracy, mean AUC)``.
    """
    sskf = ShuffleSplit(n_splits=iterNo, test_size=test_percent,
                        random_state=RANDOM_STATE)
    accuracies = []
    aucs = []
    for train_index, test_index in sskf.split(X):
        Y_true = Y[test_index]
        Y_pred = models_partc.logistic_regression_pred(
            X[train_index], Y[train_index], X[test_index])
        accuracies.append(accuracy_score(Y_true, Y_pred))
        # Truth first, prediction second, both in sample order: sorting the
        # arrays independently broke the pairing the AUC depends on.
        aucs.append(roc_auc_score(Y_true, Y_pred))
    return np.mean(accuracies), np.mean(aucs)
def fit_model(X, y):
    """Grid-search ``max_depth`` (1..10) of a decision tree classifier.

    Candidates are scored with ``performance_metric`` on 10 seeded random
    80/20 splits; the cross-validation results table is printed.

    :param X: training features.
    :param y: training labels.
    :return: the best estimator found by the search.
    """
    # Pass the splitter itself as cv.  get_n_splits() only returns the
    # integer 10, which GridSearchCV would read as "10-fold CV" and never
    # use these shuffled splits.
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
    classifier = DecisionTreeClassifier(random_state=0)
    params = {"max_depth": range(1, 11)}
    scoring_fnc = make_scorer(performance_metric)
    grid = GridSearchCV(classifier, param_grid=params, scoring=scoring_fnc,
                        cv=cv_sets)
    grid = grid.fit(X, y)
    print(pd.DataFrame(grid.cv_results_))
    return grid.best_estimator_
def step_first_train(classfiers_dict):
    """Turn training samples into vectors of averaged class probabilities.

    The training part of a seeded 90/10 split of ``load_training_data()`` is
    used.  For every sample and every label in ``labelSpace_dict``, the
    positive-class probability is averaged over that label's classifiers and
    stored at the label's index in a 24-element vector.

    :param classfiers_dict: maps each label to its list of fitted
        classifiers.
    :return: ``(new_X, step1_y)`` -- the probability vectors and the matching
        original targets.
    """
    # Use a single split of the data for training.
    X, y = load_training_data()
    rs = ShuffleSplit(n_splits=1, test_size=.1, random_state=0)
    train_index, _ = next(rs.split(X))
    step1_X = [X[i] for i in train_index]
    step1_y = [y[i] for i in train_index]

    # Predict a probability for every class.
    new_X = []
    for sample in step1_X:
        # NOTE(review): 24 is hard-coded; presumably the number of labels.
        pro_X = [0] * 24
        for label in labelSpace_dict:
            label_index = labelSpace_dict[label]
            logre_classifier = classfiers_dict[label]
            pro_total = 0.0
            for clf in logre_classifier:
                # predict_proba needs a 2-D input, so the single sample is
                # wrapped in a list; [0][1] is the positive-class value.
                pro_total += clf.predict_proba([sample])[0][1]
            pro_X[label_index] = pro_total / len(logre_classifier)
        new_X.append(pro_X)
    return new_X, step1_y
def test():
    """Prompt for a data file and return the error rate of ``classify0``.

    The file is loaded with ``fileload`` and normalised with ``norm``.  Three
    seeded random 70/30 splits are drawn and the last one is used.  Every
    test sample is classified against the training part (with 50 as the last
    argument of ``classify0``) and the misclassified samples are printed.

    :return: number of misclassified test samples divided by the test size.
    """
    filename = input("输入数据所在的文本文件的路径:")
    X, y = fileload(filename)
    X = norm(X)

    # Split into test and training sets; only the last split is kept.
    rs = ShuffleSplit(n_splits=3, test_size=.3, random_state=0)
    train_index, test_index = list(rs.split(X, y))[-1]
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    group, labels = X_train, y_train
    m = np.shape(X_test)[0]
    result = [classify0(X_test[i], group, labels, 50) for i in range(m)]

    error = 0
    print("分类错误样本:")
    # Check all m test samples so the count matches the denominator (the
    # loop used to be hard-coded to 45).
    for i in range(m):
        if result[i] != y_test[i]:
            error += 1
            print(X_test[i])
    print("错误率:")
    return error / m
def load_BMET_data(for_seq2emo=True, load_split=False):
    """Load 'data/BMETv0.3.csv' as token-truncated texts and emotion targets.

    Each text is whitespace-tokenised and cut to ``MAX_LEN_DATA`` tokens.
    With ``for_seq2emo`` the target starts as ``[0, 2, 4, 6, 8, 10]`` and the
    slot of every emotion present becomes ``2 * index + 1``; otherwise the
    target is a 0/1 vector over the six emotions.

    :param for_seq2emo: selects the target encoding described above.
    :param load_split: when true, return a seeded 80/20 train-dev/test split.
    :return: ``(source, target, EMOS, EMOS_DIC, 'BMETv0.3')``, or
        ``(X_train_dev, y_train_dev, X_test, y_test, EMOS, EMOS_DIC,
        'BMETv0.3')`` when ``load_split`` is true.
    """
    EMOS = ['anger', 'fear', 'joy', 'sadness', 'surprise', 'thankfulness']
    EMOS_DIC = {emo: idx for idx, emo in enumerate(EMOS)}

    df_data = pd.read_csv('data/BMETv0.3.csv')

    source = []
    target = []
    for _, row in df_data.iterrows():
        tokens = str(row['text']).strip().split()
        # Keep at most MAX_LEN_DATA tokens.
        source.append(' '.join(tokens[:MAX_LEN_DATA]))

        present = row['label'].split()
        if for_seq2emo:
            a_target = [0, 2, 4, 6, 8, 10]
            for emo in present:
                slot = EMOS_DIC[emo]
                a_target[slot] = slot * 2 + 1
        else:
            a_target = [0] * len(EMOS)
            for emo in present:
                a_target[EMOS_DIC[emo]] = 1
        target.append(a_target)

    if not load_split:
        return source, target, EMOS, EMOS_DIC, 'BMETv0.3'

    from sklearn.model_selection import ShuffleSplit
    ss = ShuffleSplit(n_splits=1, test_size=0.2, random_state=999)
    ss.get_n_splits(source, target)
    train_index, test_index = next(ss.split(target))
    X_train_dev = [source[i] for i in train_index]
    X_test = [source[i] for i in test_index]
    y_train_dev = [target[i] for i in train_index]
    y_test = [target[i] for i in test_index]
    return (X_train_dev, y_train_dev, X_test, y_test,
            EMOS, EMOS_DIC, 'BMETv0.3')
from sklearn.model_selection import ShuffleSplit
import numpy as np

# Demonstrates ShuffleSplit on four samples.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])

# Three seeded splits with 25% of the samples in the test part.
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
rs.get_n_splits(X)
print(rs)
# A pasted copy of the repr printed above used to sit here as a statement;
# it only built and discarded a second splitter, so it was removed.
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

# Same again, but with the training part limited to 50% of the samples.
rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25, random_state=0)
for train_index, test_index in rs.split(X):
    print("# TRAIN:", train_index, "TEST:", test_index)