def gen_sample_array(self):
    """Return all sample indices, reordered by one stratified shuffle split.

    Reads ``self.class_vector`` (must expose ``.size(0)`` and ``.numpy()``,
    presumably a torch tensor of class labels) and ``self.n_splits``.  Only
    the first split produced by ``StratifiedShuffleSplit`` is consumed.

    Returns:
        1-D numpy array: indices of the first (train) half followed by the
        indices of the second (test) half.

    Raises:
        ImportError: if scikit-learn cannot be imported.
    """
    try:
        from sklearn.model_selection import StratifiedShuffleSplit
    except ImportError:
        # Fail with a clear message right here instead of printing and then
        # crashing on the unbound name a few lines below.
        raise ImportError('Need scikit-learn for this functionality')

    splitter = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
    # Stratification is driven by the labels; the feature matrix is only a
    # placeholder with one row per sample.
    placeholder = torch.randn(self.class_vector.size(0), 2).numpy()
    labels = self.class_vector.numpy()
    first_half, second_half = next(splitter.split(placeholder, labels))
    return np.hstack([first_half, second_half])
def generate_balanced_splits(cls, samples, labels):
    """Produce one stratified train/test partition of ``samples``/``labels``.

    The test fraction is taken from the module-level name
    ``test_set_percentage``.  Both inputs must support indexing with an
    integer index array.

    Returns:
        Tuple ``(train_set, train_labels, test_set, test_labels)``.
    """
    # random_state=None gives a fresh shuffle on every call; use an integer
    # seed instead to make the partition repeatable.
    splitter = StratifiedShuffleSplit(n_splits=1,
                                      test_size=test_set_percentage,
                                      random_state=None)
    splitter.get_n_splits(samples, labels)
    # n_splits=1, so the first (and only) partition is the result.
    train_rows, test_rows = next(splitter.split(samples, labels))
    return (samples[train_rows], labels[train_rows],
            samples[test_rows], labels[test_rows])
def doExp(datasetPath, epsilon, varianceRatio, n_trails, numOfDimensions, logPath, isLinearSVM=True):
    """Run ``singleExp`` over several stratified splits of one dataset.

    The dataset is loaded from ``datasetPath`` (``np.load`` when the file
    name ends in ``npy``, otherwise a header-less comma-separated file).
    Column 0 is used as the label and the remaining columns as features.

    Args:
        datasetPath: path of the dataset file.
        epsilon: forwarded unchanged to ``singleExp``.
        varianceRatio: variance fraction passed to
            ``getNumOfPCwithKPercentVariance`` to pick the largest number of
            principal components.
        n_trails: number of stratified shuffle splits (test size 0.2).
        numOfDimensions: requested number of dimension settings to evaluate.
        logPath: file that every per-split result is appended to.
        isLinearSVM: forwarded unchanged to ``singleExp``.

    Returns:
        The per-split results stacked vertically with ``np.vstack``.
    """
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        data = pd.read_csv(datasetPath, delimiter=",", header=None).values

    scaler = StandardScaler()
    data_std = scaler.fit_transform(data[:, 1:])
    globalPCA = PCAImpl(data_std)

    numOfFeature = data.shape[1] - 1
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(varianceRatio)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("%d/%d dimensions captures %.2f variance." % (
        largestReducedFeature, numOfFeature, varianceRatio))

    if numOfDimensions > numOfFeature:
        xDimensions = np.arange(1, numOfFeature)
        largestReducedFeature = numOfFeature
    else:
        # Floor division keeps the step an integer on Python 3 as well.
        step = max(largestReducedFeature // numOfDimensions, 1)
        xDimensions = np.arange(1, largestReducedFeature, step)

    cprResult = []
    rs = StratifiedShuffleSplit(n_splits=n_trails, test_size=.2, random_state=0)
    for train_index, test_index in rs.split(data[:, 1:], data[:, 0]):
        trainingData = data[train_index]
        testingData = data[test_index]
        tmpResult = singleExp(xDimensions, trainingData, testingData,
                              largestReducedFeature, epsilon, isLinearSVM)
        # Append after every split so partial results survive an interruption.
        with open(logPath, "a") as f:
            np.savetxt(f, tmpResult, delimiter=",", fmt='%1.3f')
        cprResult.append(tmpResult)

    cprResult = np.vstack(cprResult)
    for result in cprResult:
        print(','.join(['%.3f' % num for num in result]))
    return cprResult
def split(dpath, proc_data_path):
    """Copy the PNG images under ``dpath`` into train/val/test folders.

    Images are labelled from their path: 0 when it contains ``"not"``,
    1 when it contains ``"open"``, otherwise 2 ("checked").  A stratified
    80/20 shuffle split (random_state=42) separates train from validation.
    The first few training images of every class are diverted to
    ``data/test`` instead of ``data/train/<label>``.  Files whose name
    starts with ``_`` are skipped.

    Args:
        dpath: root directory searched recursively for ``*.png``.
        proc_data_path: output prefix; sub-paths are appended by plain string
            concatenation, so it is expected to end with a path separator.
    """
    for sub_dir in ('models', 'data/test',
                    'data/train/0', 'data/val/0',   # not
                    'data/train/1', 'data/val/1',   # open
                    'data/train/2', 'data/val/2'):  # checked
        make_dir(proc_data_path + sub_dir)

    imgs_path = glob.glob(dpath + "/**/*.png", recursive=True)
    labels = []
    for img in imgs_path:
        if "not" in img:
            labels.append(0)
        elif "open" in img:
            labels.append(1)
        else:
            labels.append(2)
    imgs_path = np.array(imgs_path)
    labels = np.array(labels)

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, test_index = next(sss.split(imgs_path, labels))
    X_train, X_test = imgs_path[train_index], imgs_path[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Per-class size of the held-out test set: half of a third of the
    # validation set size.
    t_cnt_per_cls = int((y_test.shape[0] / 3) / 2)
    # A set gives O(1) membership checks in the copy loop below.
    tests = set(np.concatenate([
        X_train[y_train == cls_id][0:t_cnt_per_cls] for cls_id in (0, 1, 2)
    ]))

    for img_i, img in enumerate(X_train):
        img_name = img.strip().replace("\\", "/").split("/")[-1]
        if img_name[0] == '_':
            continue
        if img in tests:
            dst = proc_data_path + 'data/test/' + str(img_i) + "_" + img_name
        else:
            dst = (proc_data_path + 'data/train/' + str(y_train[img_i])
                   + "/" + str(img_i) + "_" + img_name)
        copyfile(img, dst)

    for img_i, img in enumerate(X_test):
        img_name = img.strip().replace("\\", "/").split("/")[-1]
        if img_name[0] == '_':
            continue
        dst = (proc_data_path + 'data/val/' + str(y_test[img_i])
               + "/" + str(img_i) + "_" + img_name)
        copyfile(img, dst)
def gen_sample_array(self):
    """Return all sample indices, reordered by one stratified shuffle split.

    Reads ``self.class_vector`` (must expose ``.size(0)`` and ``.numpy()``,
    presumably a torch tensor of class labels) and ``self.n_splits``.  Only
    the first split produced by ``StratifiedShuffleSplit`` is consumed.

    Returns:
        1-D numpy array: indices of the first (train) half followed by the
        indices of the second (test) half.

    Raises:
        ImportError: if scikit-learn cannot be imported.
    """
    try:
        from sklearn.model_selection import StratifiedShuffleSplit
    except ImportError:
        # Fail with a clear message right here instead of printing and then
        # crashing on the unbound name a few lines below.
        raise ImportError('Need scikit-learn for this functionality')
    import numpy as np

    splitter = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
    # Stratification is driven by the labels; the feature matrix is only a
    # placeholder with one row per sample.
    placeholder = th.randn(self.class_vector.size(0), 2).numpy()
    labels = self.class_vector.numpy()
    first_half, second_half = next(splitter.split(placeholder, labels))
    return np.hstack([first_half, second_half])
def main(_):
    """Build train/test sets from monitored and unmonitored data and run the model.

    Data is imported from ``<this file's dir>/../../data/cells``.  Monitored
    samples are divided with one stratified shuffle split (``TEST_SIZE``,
    random_state=123); unmonitored samples are shuffled via
    ``helpers.shuffle_data`` and cut at the same ratio, then appended to both
    sets with label ``-1``.  The test set is persisted with ``store_data``
    and ``run_model`` is invoked on the training inputs.

    A ``KeyboardInterrupt`` is reported on stdout and ends the process with
    exit status 0.

    Args:
        _: unused positional argument.
    """
    dirname = ospath.split(ospath.abspath(__file__))[0]

    try:
        data_dir = dirname + '/../../data/cells'
        paths, labels = import_data(data_dir=data_dir, in_memory=False,
                                    extension=args.extension)

        monitored_data, monitored_label, unmonitored_data = split_mon_unmon(
            paths, labels)
        monitored_data = np.array(monitored_data)
        monitored_label = np.array(monitored_label)
        unmonitored_data = np.array(unmonitored_data)

        # NOTE(review): return value is ignored, so this presumably shuffles
        # in place - confirm against helpers.shuffle_data.
        helpers.shuffle_data(unmonitored_data)
        cut = int((1 - TEST_SIZE) * len(unmonitored_data))
        unmon_train, unmon_test = unmonitored_data[:cut], unmonitored_data[cut:]

        sss = StratifiedShuffleSplit(n_splits=1, test_size=TEST_SIZE,
                                     random_state=123)
        train_index, test_index = next(
            sss.split(monitored_data, monitored_label))
        X_train, X_test = monitored_data[train_index], monitored_data[test_index]
        y_train, y_test = monitored_label[train_index], monitored_label[test_index]

        X_train = np.append(X_train, unmon_train)
        X_test = np.append(X_test, unmon_test)
        y_train = np.append(y_train, [-1] * len(unmon_train))
        # The test labels must be padded by the number of unmonitored *test*
        # samples so that y_test stays aligned with X_test.
        y_test = np.append(y_test, [-1] * len(unmon_test))

        store_data(X_test, 'X_test')
        store_data(y_test, 'y_test')

        stdout.write("Training on data...\n")
        # NOTE(review): y_train is built but not handed to run_model here.
        run_model(X_train, in_memory=False)
        stdout.write("Finished running model.")
    except KeyboardInterrupt:
        stdout.write("Interrupted, this might take a while...\n")
        exit(0)
def ratio_data_loader(self):
    """Split ``self.full_data`` pair by pair into stratified train/test sets.

    ``self.full_data[0]`` is reshaped to ``(pair_num, 100, 200)`` and
    ``self.full_data[1]`` to ``(pair_num, 100)``: 100 solutions per pair,
    200 features per solution.  Every pair gets its own stratified shuffle
    split (10 % test, random_state=42); the per-pair pieces are then
    flattened back into 2-D arrays.

    :return: None; ``self.train_data`` and ``self.test_data`` are updated
        with ``(X, Y)`` tuples, X of shape ``(rows, 200)`` and Y of shape
        ``(rows, 1)``.
    """
    test_size = 0.1
    num_sol = 100  # number of solutions per pair
    num_of_features = 200
    pair_num = int(self.full_data[0].shape[0] / num_sol)

    dataset_X = self.full_data[0].reshape(pair_num, num_sol, num_of_features)
    dataset_Y = self.full_data[1].reshape(pair_num, num_sol)

    X_TR, X_TS, Y_TR, Y_TS = [], [], [], []
    for pair_X, pair_Y in zip(dataset_X, dataset_Y):
        stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                            random_state=42)
        train_idx, test_idx = next(stratSplit.split(pair_X, pair_Y))
        X_TR.append(pair_X[train_idx])
        X_TS.append(pair_X[test_idx])
        Y_TR.append(pair_Y[train_idx])
        Y_TS.append(pair_Y[test_idx])

    # Let numpy infer the row count.  The previous code passed a computed
    # negative number (based on the *test* size even for the train arrays),
    # which is only meaningful if numpy happens to treat it like -1.
    X_TR = np.array(X_TR).reshape(-1, num_of_features)
    X_TS = np.array(X_TS).reshape(-1, num_of_features)
    Y_TR = np.array(Y_TR).reshape(-1, 1)
    Y_TS = np.array(Y_TS).reshape(-1, 1)

    self.train_data = (X_TR, Y_TR)
    self.test_data = (X_TS, Y_TS)
    print("train data shape: ", X_TR.shape)
    print("test data shape: ", X_TS.shape)
def CrossValidation(model):
    """Fit ``model`` on five stratified shuffle splits and print mean scores.

    Works on the module-level ``x`` and ``y`` (indexed through ``.iloc``).
    Each round holds out 30 % of the rows, thresholds the model's
    predictions at 0.5 and accumulates F1 and accuracy.  Prints the shapes
    of every split, then the model and the two averaged scores.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
    """
    n_rounds = 5
    splitter = StratifiedShuffleSplit(n_splits=n_rounds, test_size=0.3)
    splitter.get_n_splits(x, y)

    mean_f1 = 0
    mean_acc = 0
    for fit_rows, eval_rows in splitter.split(x, y):
        x_fit, x_eval = x.iloc[fit_rows], x.iloc[eval_rows]
        y_fit, y_eval = y.iloc[fit_rows], y.iloc[eval_rows]
        print(x_fit.shape, x_eval.shape)

        model.fit(x_fit, y_fit)
        raw_output = model.predict(x_eval)
        # Binarise the predictions at 0.5.
        hard_labels = np.where(raw_output > 0.5, 1, 0)

        mean_f1 += f1_score(y_eval, hard_labels) / n_rounds
        mean_acc += accuracy_score(y_eval, hard_labels) / n_rounds

    print(model)
    print(mean_f1, mean_acc)
def gen_sample_array(self):
    """Return all sample indices, reordered by one stratified shuffle split.

    Reads ``self.class_vector`` (must expose ``.size(0)`` and ``.numpy()``,
    presumably a torch tensor of class labels) and ``self.n_splits``.  Only
    the first split produced by ``StratifiedShuffleSplit`` is consumed.

    Returns:
        1-D numpy array: indices of the first (train) half followed by the
        indices of the second (test) half.

    Raises:
        ImportError: if scikit-learn is missing or fails while importing.
    """
    # Raise instead of printing: continuing without the import would only
    # crash later on the unbound name.
    try:
        from sklearn.model_selection import StratifiedShuffleSplit
    except ModuleNotFoundError:
        raise ImportError('Need scikit-learn for this functionality')
    except Exception:
        raise ImportError(
            'There exists some errors in your scikit-learn installation')

    splitter = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
    # Stratification is driven by the labels; the feature matrix is only a
    # placeholder with one row per sample.
    placeholder = th.randn(self.class_vector.size(0), 2).numpy()
    labels = self.class_vector.numpy()
    first_half, second_half = next(splitter.split(placeholder, labels))
    return np.hstack([first_half, second_half])
def get_stratified_sample(X,y,verbose=True,test_size=.2):
    """Return a stratified train/test partition of X and y.

    Ten stratified shuffle splits are generated with ``random_state=0`` and
    walked through in order; the partition that is returned is the one from
    the last split.

    X : input matrix, indexable with an integer index array
    y : output/label array, indexable with an integer index array
    verbose : when true, print the index arrays of every split
    test_size : fraction of the data placed in the test set

    Returns the list ``[X_train, X_test, y_train, y_test]``.
    """
    splitter = StratifiedShuffleSplit(n_splits=10, test_size=test_size,
                                      random_state=0)
    splitter.get_n_splits(X, y)
    print(splitter)

    for train_index, test_index in splitter.split(X, y):
        if verbose:
            print("TRAIN:", train_index, "TEST:", test_index)
        # Overwritten on every pass; only the final split survives the loop.
        X_train = X[train_index]
        X_test = X[test_index]
        y_train = y[train_index]
        y_test = y[test_index]

    return [X_train, X_test, y_train, y_test]
def metrics(X, Y, n_classes, norm):
    """Print repeated-split and single-split evaluation of an MLP classifier.

    Part one: ten stratified shuffle splits (10 % test, random_state=0); a
    16-16-16 MLP is trained per split and the confusion matrices are summed.
    Part two: one ``train_test_split`` with 40 % test data, a 10-10-10 MLP,
    its confusion matrix, accuracy and Cohen's kappa.

    Args:
        X: feature array, indexable with an integer index array.
        Y: label array, indexable with an integer index array.
        n_classes: side length of the accumulated confusion matrix.
        norm: when truthy, ``normalize`` is called on every train/test part.
    """
    max_iterations = 3000

    # --- repeated stratified shuffle splits -------------------------------
    splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.1,
                                      random_state=0)
    splitter.get_n_splits(X, Y)
    summed_cm = np.zeros((n_classes, n_classes))
    for train_index, test_index in splitter.split(X, Y):
        print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
        x_fit, x_eval = X[train_index], X[test_index]
        y_fit, y_eval = Y[train_index], Y[test_index]
        if norm:
            # NOTE(review): the return value is discarded, so this only has
            # an effect if `normalize` works in place - confirm.
            normalize(x_fit)
            normalize(x_eval)
        fold_model = MLPClassifier(hidden_layer_sizes=(16, 16, 16),
                                   max_iter=max_iterations)
        fold_model.fit(x_fit, y_fit)
        fold_pred = fold_model.predict(x_eval)
        summed_cm += confusion_matrix(y_eval, fold_pred)
    print("10 Foldcross Validation \n", summed_cm)
    print("Accuracy: ", sum(summed_cm.diagonal()) / np.sum(summed_cm))

    # --- single hold-out confusion matrix ---------------------------------
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4)
    if norm:
        normalize(X_train)
        normalize(X_test)
    holdout_model = MLPClassifier(hidden_layer_sizes=(10, 10, 10),
                                  max_iter=max_iterations)
    holdout_model.fit(X_train, Y_train)
    holdout_pred = holdout_model.predict(X_test)
    holdout_cm = confusion_matrix(Y_test, holdout_pred)
    print("\nConfusion Matrix \n", holdout_cm)
    print("Accuracy: ", sum(holdout_cm.diagonal()) / np.sum(holdout_cm))
    print("\nKappa Score\n", cohen_kappa_score(Y_test, holdout_pred))
def compute(self,X,y,C,gamma, test_size=0.3, n_iterations = 5, training_set_minsize = 10, learning_curves_step = 20):
    """Learning-curve scaffold (exercise template).

    Sets up stratified shuffle re-partitions of ``(X, y)`` and, for every
    training-set size ``m`` in ``range(training_set_minsize, train_size,
    learning_curves_step)``, draws a stratified sub-sample of ``m`` training
    rows.  The error computation itself is left as an exercise: as written,
    the first inner iteration raises.

    Args:
        X: 2-D feature array (``X.shape[1]`` features).
        y: label array with ``len(y) == len(X)``.
        C, gamma: numeric hyper-parameters (validated, not used yet).
        test_size: fraction held out in every re-partition.
        n_iterations: number of re-partitions.
        training_set_minsize: smallest sub-sample size.
        learning_curves_step: increment between sub-sample sizes.

    Raises:
        AssertionError: on inconsistent lengths or non-numeric C/gamma.
        Exception: placeholder raised until the exercise is implemented.
    """
    assert len(X) == len(y)
    assert len(y) > training_set_minsize
    assert isinstance(C, (int, float))
    assert isinstance(gamma, (int, float))

    train_size = int(round((1 - test_size) * len(y)))
    set_ripartitions = StratifiedShuffleSplit(n_splits=n_iterations,
                                              test_size=test_size)
    n_iter = set_ripartitions.get_n_splits(X, y)
    n_samples = X.shape[0]
    n_features = X.shape[1]
    m_list = range(training_set_minsize, train_size, learning_curves_step)
    # The builtin `float` replaces the `np.float` alias, which no longer
    # exists in current NumPy releases.
    tr_errors = np.zeros((len(m_list), 1), dtype=float)
    cv_errors = np.zeros((len(m_list), 1), dtype=float)

    for train, test in set_ripartitions.split(X, y):
        X_tr, X_cv, y_tr, y_cv = X[train], X[test], y[train], y[test]
        idx = 0
        for m in m_list:
            y_mask = self.stratifiedShuffleMask(y_tr, m)
            # Repeat the row mask across all feature columns.
            x_mask = np.kron(np.ones((n_features, 1)), y_mask).T
            reduced_X = X_tr[x_mask != 0].reshape(m, n_features)
            reduced_y = y_tr[y_mask != 0]
            # TODO: Exercise 3
            # Read the code of the current method "compute" and understand
            # what is happening.  Once you have understood the code, try to
            # understand the meaning of the stratifiedShuffleMask method.
            # What is that method supposed to do?  What do reduced_X and
            # reduced_y contain?
            # Then compute, for each m, the training error and the
            # cross-validation error averaged over the different re-arranged
            # dataset repartitions, and store them in the tr_errors and
            # cv_errors numpy vectors respectively, ordered by the idx index.
            raise Exception("One last effort! It is the last exercise.")
            idx += 1

    # NOTE(review): `result` is assembled but not returned in the code
    # visible here - confirm whether a trailing `return result` is missing.
    result = dict()
    result["m_list"] = m_list
    result["tr_errors"] = tr_errors
    result["cv_errors"] = cv_errors
def eval_model(X, y, args):
    """Walk over stratified partitions and seeds, preparing per-run settings.

    ``args.cv`` stratified shuffle splits are created (seeded with
    ``args.samples[0]``).  For every partition the data is loaded through
    ``load_partition`` and callbacks are obtained from ``get_callbacks``;
    for every seed in ``args.samples`` a message is printed and a weight-file
    name template is built.

    NOTE(review): the data, callbacks and file name are computed but not
    used in the code visible here - the function may continue elsewhere.
    """
    from sklearn.model_selection import StratifiedShuffleSplit

    kf = StratifiedShuffleSplit(n_splits=args.cv,
                                random_state=args.samples[0])
    kf.get_n_splits(X, y)

    # Partitions are numbered from 1.
    for partition_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
        (x_train, y_train), (x_test, y_test) = load_partition(
            train_idx, test_idx, X, y)
        calls = get_callbacks(args, partition_idx)

        for s_idx, seed in enumerate(args.samples):
            print('{} Training with SEED {}'.format(s_idx, seed))
            name_prefix = '{}-{}-partition_{}-seed_{}'.format(
                args.model_type, args.timestamp, partition_idx, s_idx)
            # The epoch/val_loss placeholders are left unformatted on purpose.
            weight_file_name = (
                name_prefix + '-epoch_{epoch:02d}-loss_{val_loss:.2f}.hdf5')
def searchMethodFun():
    """Build (but do not fit) a ``GridSearchCV`` over a scaled pipeline.

    The estimator and its parameter grid come from
    ``model_params(methodChoice)``; scoring wraps ``scoring_method`` with
    ``pos_label=1``; cross-validation uses three stratified shuffle splits
    with a 50 % test share and ``random_state=0``.

    Returns:
        The configured ``GridSearchCV`` instance.
    """
    weightRange = [{0: 1, 1: values} for values in range(14, 24)]

    param_gridA, method = model_params(methodChoice)
    pipeA = pipeline.make_pipeline(preprocessing.StandardScaler(), method)

    # Generate the requested number of independent train/test partitions:
    # all samples are shuffled first, then divided into train/test pairs.
    # The partitions are stratified, i.e. each one keeps the class
    # proportions of the complete dataset.
    fscore = make_scorer(scoring_method, pos_label=1)
    cv_splitter = StratifiedShuffleSplit(n_splits=3, test_size=0.5,
                                         random_state=0)
    cv_splitter.get_n_splits(y_train)

    # Alternative kept for reference:
    # RandomizedSearchCV(pipeA, param_distributions=param_gridA, n_iter=20)
    return GridSearchCV(pipeA,
                        param_grid=param_gridA,
                        scoring=fscore,
                        cv=cv_splitter,
                        n_jobs=1)
def train_model(self):
    """Compile the model and fit it with a stratified 10 % validation hold-out.

    ``self.x_train`` / ``self.y_train`` are divided by one stratified
    shuffle split (random_state=13); the larger part is used for fitting and
    the smaller part is passed to ``self.fit`` as validation data.

    Returns:
        ``self.model`` after fitting.
    """
    from keras import callbacks as C
    from sklearn.model_selection import StratifiedShuffleSplit

    self.compile()

    splitter = StratifiedShuffleSplit(n_splits=1, random_state=13,
                                      test_size=0.1)
    splitter.get_n_splits(self.x_train, self.y_train)
    # Exactly one split is configured, so take it directly.
    fit_rows, val_rows = next(splitter.split(self.x_train, self.y_train))

    X_train, X_val = self.x_train[fit_rows], self.x_train[val_rows]
    Y_train, Y_val = self.y_train[fit_rows], self.y_train[val_rows]
    self.fit(X_train, Y_train, (X_val, Y_val))
    return self.model
def __init__(self, dataset_name, split, out_path):
    """Load one split of a UEA/UCR dataset into ``self.X`` / ``self.y``.

    ``'testing'`` loads the dataset's test portion unchanged.
    ``'training'`` and ``'validation'`` both load the train portion and keep
    only the rows listed for that split in
    ``<out_path>/validation_split_file.pkl``.  When that file does not exist
    yet, a stratified 80/20 split (random_state=42) is generated and pickled
    there first.

    Args:
        dataset_name: name handed to ``uea_ucr_datasets.Dataset``.
        split: one of ``'training'``, ``'validation'``, ``'testing'``.
        out_path: directory holding the cached split file.

    Raises:
        ValueError: for any other ``split`` value.
    """
    uea_ucr_datasets.list_datasets()

    if split == 'testing':
        data = uea_ucr_datasets.Dataset(dataset_name, train=False)
        self.X, self.y = _to_array(data)
        return

    if split not in ['training', 'validation']:
        raise ValueError('Provided split not available.',
                         'Use any of [training, validation, testing]')

    validation_split_file = os.path.join(out_path,
                                         'validation_split_file.pkl')
    data = uea_ucr_datasets.Dataset(dataset_name, train=True)
    X, y = _to_array(data)

    if os.path.isfile(validation_split_file):
        print('Loading stratified training/validation split.')
        # The pickle is a cache written by this same method.
        with open(validation_split_file, 'rb') as f:
            split_dict = pickle.load(f)
    else:
        print('Generating stratified training/validation split...')
        from sklearn.model_selection import StratifiedShuffleSplit
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
                                     random_state=42)
        # The real X is passed to the splitter (no dummy array is used).
        sss.get_n_splits(X, y)
        training_indices, validation_indices = next(sss.split(X, y))
        split_dict = {
            'training': training_indices,
            'validation': validation_indices
        }
        # Persist the split ids so later runs reuse the same partition.
        if not os.path.exists(out_path):
            os.makedirs(out_path, exist_ok=True)
        with open(validation_split_file, 'wb') as f:
            pickle.dump(split_dict, f)

    indices = split_dict[split]
    self.X = X[indices]
    self.y = y[indices]
def CrossValidate_LR(X, y, config, output_layer_size):
    """Grid-search learning rate and optimizer for ``LogisticRegression``.

    Every (learning rate, optimizer) pair is evaluated on the same five
    stratified shuffle splits (20 % validation, random_state=42).  One
    tab-separated row per split is written to ``LR_ModelSelection.txt`` in
    the current working directory.

    Args:
        X: feature array, indexable with an integer index array.
        y: label array, indexable with an integer index array.
        config: settings dict; it is modified in place (``loss``, ``seed``,
            layer sizes, ``lr`` and ``opt`` are overwritten).
        output_layer_size: stored in ``config["output_layer_size"]``.
    """
    kf = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

    config["loss"] = "ce"
    config["seed"] = 1234
    config.update({
        "input_layer_size": X.shape[1],
        "output_layer_size": output_layer_size
    })

    lr = [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]
    opt = ["adam", "nag", "momentum", "gd"]

    # The context manager closes the results file even when training raises.
    with open('LR_ModelSelection.txt', 'w') as f:
        f.write('lr\tOptimizer\tCV\tAccuracy\tPrecision\tRecall\tF1\n')
        for rate in lr:
            for optimizer in opt:
                config["lr"] = rate
                config["opt"] = optimizer
                for i, (train_index, val_index) in enumerate(kf.split(X, y)):
                    print('lr {}, CV={}...............\n'.format(rate, i))
                    X_train, X_val = X[train_index], X[val_index]
                    y_train, y_val = y[train_index], y[val_index]

                    # Fresh model configured with the current hyperparameters.
                    LR = LogisticRegression(config)
                    Accuracy, Precision, Recall, F1 = LR.Train_LR(
                        X_train, y_train, X_val, y_val)
                    # The file numbers splits from 1, the console from 0.
                    f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        rate, optimizer, i + 1, Accuracy, Precision, Recall,
                        F1))
def stratified_split(self, X, y, n_splits=10, test_size=0.2):
    """
    Stratified shuffle splitting of the input via scikit-learn.

    @param: X           List of text values for train/test
    @param: y           List of expected labels for train/test
    @param: n_splits    Number of splits to generate (default 10)
    @param: test_size   Share of the data held out for testing (default 0.2)

    return List with one dictionary per split, keyed by "train" and "test",
           each holding the keys "X" and "y"
    """
    splitter = StratifiedShuffleSplit(n_splits=n_splits,
                                      test_size=test_size,
                                      random_state=42)
    splitter.get_n_splits(X, y)

    def package(train_rows, test_rows):
        # Element-wise picking keeps this usable with plain Python lists.
        X_train = [X[i] for i in train_rows]
        X_test = [X[i] for i in test_rows]
        y_train = [y[i] for i in train_rows]
        y_test = [y[i] for i in test_rows]
        # add augmentation code here
        return {"train": {"X": X_train, "y": y_train},
                "test": {"X": X_test, "y": y_test}}

    return [package(train_rows, test_rows)
            for train_rows, test_rows in splitter.split(X, y)]
def Check_model_on_random_splits(X, y, supp, n_splits, verbose, stats_model=True):
    """Fit the WOE model on several random stratified halves of the data.

    Only the columns ``supp`` of ``X`` are used.  For each of ``n_splits``
    stratified shuffle splits (50 % test, random_state=1234) the data is
    WOE-transformed with ``run_WOE`` and handed to ``Model_on_vars``.

    Args:
        X: feature frame (indexed with ``X[supp]`` and ``.iloc``).
        y: target, indexed with ``.iloc``.
        supp: column selection, also forwarded as ``supp_var``.
        n_splits: number of random splits.
        verbose: forwarded to ``Model_on_vars``.
        stats_model: forwarded to ``Model_on_vars``.

    Returns:
        List with the ``Model_on_vars`` output of every split that succeeded;
        failing splits are reported with a printed message and skipped.
    """
    X_small = X[supp]
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=.5,
                                 random_state=1234)

    out_arr = []
    for train_index, test_index in sss.split(X_small, y):
        X_train, X_test = X_small.iloc[train_index], X_small.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        out = run_WOE(X_train, X_test, y_train, y_test, max_bin=5,
                      keep_all=True)
        # Elements 2 and 3 (WOE tables) are unpacked but not needed here.
        X_WOE_tranformed_train, X_WOE_tranformed_test, _woe, _woe_concise = (
            out[0], out[1], out[2], out[3])

        try:
            out = Model_on_vars(
                supp_var=supp,
                X_WOE_tranformed_train=X_WOE_tranformed_train,
                X_WOE_tranformed_test=X_WOE_tranformed_test,
                y_train=y_train,
                y_test=y_test,
                verbose=verbose,
                stats_model=stats_model)
            out_arr.append(out)
        except Exception:
            # Best effort: a failing split is skipped.  Catching Exception
            # (not everything) lets Ctrl-C and SystemExit propagate.
            print('whoops')
    return (out_arr)
def make_dataset(full_path, seed):
    """Load the CSV samples and split them 60/20/20 into train/validate/test.

    File names come from ``get_file_names(full_path)``; the first nine
    folders are read.  Every CSV is flattened row by row into one feature
    vector.  The label is the file's base name up to the first ``.`` minus
    its last character, mapped through ``conversion`` and then
    ``conversion_one_hot``.

    Two chained stratified shuffle splits (both seeded with ``seed``) first
    hold out 40 % of the samples and then halve that hold-out into test and
    validation parts.

    Returns:
        ``(train_X, validate_X, test_X, train_y, validate_y, test_y)`` as
        float32 numpy arrays.
    """
    full_file_names = get_file_names(full_path)

    X_flattened = []
    y_numbers = []
    for folder_number in range(9):
        for path in full_file_names[folder_number]:
            sample_data = genfromtxt(path, delimiter=',')
            X_flattened.append(list(chain.from_iterable(sample_data)))
            base_name = path.split('/')[-1].split('.')[0]
            # The trailing character of the base name is not part of the label.
            y_numbers.append(conversion(base_name[:-1]))

    X_to_use = X_flattened
    y_numbers = conversion_one_hot(y_numbers)

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=seed)
    train_index, holdout_index = next(sss.split(X_to_use, y_numbers))
    # Convert once and index twice instead of rebuilding the arrays per use.
    X_all = np.array(X_to_use, dtype=np.float32)
    y_all = np.array(y_numbers, dtype=np.float32)
    train_X, holdout_X = X_all[train_index], X_all[holdout_index]
    train_y, holdout_y = y_all[train_index], y_all[holdout_index]

    ssss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=seed)
    test_index, validate_index = next(ssss.split(holdout_X, holdout_y))
    test_X, validate_X = holdout_X[test_index], holdout_X[validate_index]
    test_y, validate_y = holdout_y[test_index], holdout_y[validate_index]

    print("train size: ", len(train_X))
    print("validate size: ", len(validate_X))
    print("test_ size: ", len(test_X))
    return train_X, validate_X, test_X, train_y, validate_y, test_y
def test_imb_performance():
    """Check that SMOTEBoost is never worse than AdaBoost on imbalanced data.

    A simulated two-class dataset (100 samples, weights 0.9/0.1) is divided
    by five stratified shuffle splits (30 % test).  On every split both
    classifiers are fitted with ``random_state=0`` and compared through
    Cohen's kappa on the test part.
    """
    from maatpy.dataset import simulate_dataset
    from sklearn.metrics import cohen_kappa_score
    from sklearn.model_selection import StratifiedShuffleSplit

    imb = simulate_dataset(n_samples=100, n_features=2, n_classes=2,
                           weights=[0.9, 0.1], random_state=0)
    splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.3,
                                      random_state=0)
    splitter.get_n_splits(imb.data, imb.target)

    for fit_rows, eval_rows in splitter.split(imb.data, imb.target):
        X_train, X_test = imb.data[fit_rows], imb.data[eval_rows]
        y_train, y_test = imb.target[fit_rows], imb.target[eval_rows]

        # Baseline.
        adaboost = AdaBoostClassifier(random_state=0)
        adaboost.fit(X_train, y_train)
        adaboost_score = cohen_kappa_score(adaboost.predict(X_test), y_test)

        # Candidate under test.
        clf = SMOTEBoost(random_state=0)
        clf.fit(X_train, y_train)
        score = cohen_kappa_score(clf.predict(X_test), y_test)

        assert score >= adaboost_score, "Failed with score = %f; AdaBoostClassifier score= %f" % (score, adaboost_score)
def gen_sample_array(self):
    """Return a class-interleaved list of sample indices.

    Reads ``self.class_vector`` (must expose ``.size(0)`` and ``.numpy()``,
    presumably a torch tensor of class labels) and ``self.n_splits``.  The
    first stratified shuffle split (25 % test, random_state=19) orders the
    samples; they are then bucketed by label (0, 1, 2, anything else) and
    the first ``n_splits`` members of every bucket are emitted, each one
    twice, in the order bucket 0, 1, 2, other.

    Returns:
        List with ``8 * n_splits`` indices.

    Raises:
        ImportError: if scikit-learn cannot be imported.
        IndexError: if a bucket holds fewer than ``n_splits`` samples.
    """
    try:
        from sklearn.model_selection import StratifiedShuffleSplit
    except ImportError:
        # Fail with a clear message right here instead of printing and then
        # crashing on the unbound name a few lines below.
        raise ImportError('Need scikit-learn for this functionality')
    import numpy as np

    cn = []
    ad = []
    emci = []
    lmci = []

    s = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.25,
                               random_state=19)
    # Stratification is driven by the labels; X is only a placeholder.
    X = torch.randn(self.class_vector.size(0), 4).numpy()
    y = self.class_vector.numpy()
    train_index, test_index = next(s.split(X, y))
    indices = np.hstack([train_index, test_index])

    for i in indices:
        # NOTE(review): `i` is already a sample index, yet it is used again
        # as a position into `indices`.  Every sample is still visited once
        # because `indices` is a permutation, but the order differs from
        # `indices` itself.  Kept as is, since changing it would alter the
        # emitted sample order - confirm the intent.
        sample = indices[i]
        if y[sample] == 0:
            cn.append(sample)
        elif y[sample] == 1:
            ad.append(sample)
        elif y[sample] == 2:
            emci.append(sample)
        else:
            lmci.append(sample)

    new_indices = []
    for i in range(s.get_n_splits(X, y)):
        for bucket in (cn, ad, emci, lmci):
            # Every selected sample is emitted twice in a row.
            new_indices.append(bucket[i])
            new_indices.append(bucket[i])
    return new_indices
def split_data(client, data, n=1):
    """
    Specifications:
        Lazily yields n stratified train/test partitions of data
        (1 by default), each with a 30 % test share and random_state=0.

    Args:
        client: object used to fetch the label enumeration from
            collection "meta_data1", document "s2i"
        data (dict): {data point: label}
        n (int, optional): number of splits

    Yields:
        tuple of 2 dicts: train and test dictionary, both mapping a data
        point to its enumerated label
    """
    data_points, labels = zip(*data.items())

    # Translate the raw labels through the stored enumeration.
    enumeration = client.collection("meta_data1").document(
        "s2i").get().to_dict()
    labels = [enumeration[label] for label in labels]

    sss = StratifiedShuffleSplit(n_splits=n, test_size=0.3, random_state=0)
    sss.get_n_splits(data_points, labels)

    for train_index, test_index in sss.split(data_points, labels):
        data_train = {data_points[pos]: labels[pos] for pos in train_index}
        data_test = {data_points[pos]: labels[pos] for pos in test_index}
        yield data_train, data_test
def train_and_cross_validate(sizes, num_hidden=8, n_epochs=50000, eta=0.01):
    """Train one ``NN`` across three stratified shuffle splits.

    The full data ``(X, Y)`` comes from ``prepare_data``; feature importance
    is reported first.  The same network instance is trained and tested on
    every split (30 % test, random_state=0), and its weights and biases are
    written to text files after each one, overwriting the previous dump.

    Args:
        sizes: passed to the ``NN`` constructor.
        num_hidden, n_epochs, eta: accepted but not used in this function.
    """
    # Only the complete X / Y are needed; the ready-made split is ignored.
    _, _, _, _, X, Y = prepare_data()
    get_feature_importance(X, Y)
    nn = NN(sizes)

    splitter = StratifiedShuffleSplit(n_splits=3, test_size=0.3,
                                      random_state=0)
    splitter.get_n_splits(X, Y)

    for fold, (train_index, test_index) in enumerate(splitter.split(X, Y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        # Only the training part is cast to float32.
        X_train = np.array(X_train, dtype=np.float32)
        y_train = np.array(y_train, dtype=np.float32)

        train(nn, X_train, y_train, X_test, y_test, fold)
        test(nn, X_test, y_test)
        np.savetxt("nn_network_weights.txt", nn._W)
        np.savetxt("nn_network_biases.txt", nn._b)
def take_stratified_split(_df, target, n_splits=10, valid_size=None, test_idx=None):
    """Return a copy of ``_df`` with an integer ``Fold`` column.

    * ``n_splits == 1``: one stratified shuffle split; fold 0 is train,
      fold 1 is validation and, if ``test_idx`` is given, fold 2 is test.
    * ``n_splits > 1``: stratified k-fold; each row's fold is the index of
      the fold in which it is validated, and test rows (if any) get fold
      ``n_splits``, the largest value.

    All row selections are positional, so the result does not depend on the
    index labels of ``_df``.

    Args:
        _df: input frame; it is not modified.
        target: column used for stratification.
        n_splits: 1 for a train/valid split, otherwise the number of folds.
        valid_size: validation share; required when ``n_splits == 1`` and
            must be ``None`` otherwise.
        test_idx: optional integer positions of rows reserved as test data.

    Returns:
        The annotated frame.  When ``test_idx`` is given the test rows are
        appended at the end and the index is renumbered.
    """
    df = _df.copy()
    df_test = None
    if test_idx is not None:
        df_test = df.iloc[test_idx].copy()
        # Drop the test rows by position, matching the `iloc` selection
        # above (a label-based `index.isin` could remove different rows).
        keep = pd.Series(True, index=range(len(df)))
        keep.iloc[test_idx] = False
        df = df.iloc[keep.to_numpy()].copy()

    y = df[target]
    seed = 1234
    # The splitters yield row positions, so fold ids are collected by
    # position and attached as a whole column afterwards.
    fold = pd.Series(-1, index=range(len(df)))

    if n_splits == 1:
        # take train valid split
        assert valid_size is not None
        sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=valid_size,
                                     random_state=seed)
        train_index, valid_index = next(sss.split(df, y))
        fold.iloc[train_index] = 0  # fold 0 is train data
        fold.iloc[valid_index] = 1  # fold 1 is validation data
        test_fold = 2  # fold 2 is test data
    else:
        assert valid_size is None
        skf = StratifiedKFold(n_splits=n_splits, random_state=seed,
                              shuffle=True)
        for i, (_, valid_index) in enumerate(skf.split(df, y)):
            fold.iloc[valid_index] = i
        # the fold with the largest value corresponds to the test data
        test_fold = n_splits

    df['Fold'] = fold.to_numpy()
    if df_test is not None:
        df_test['Fold'] = test_fold
        df = pd.concat([df, df_test], ignore_index=True)

    df.Fold = df.Fold.astype(int)
    return df
def crossValPrediction(otu_use, y, max_depth=10, n_estimators=65, weight=5, plot=False, plot_pr=False, folds=5):
    """Evaluate ``predictIBD`` on ``folds`` stratified shuffle splits.

    Args:
        otu_use: feature frame, indexed by position through ``.iloc``.
        y: labels; converted with ``np.array`` before indexing.
        max_depth, n_estimators, weight, plot, plot_pr: forwarded unchanged
            to ``predictIBD``.
        folds: number of shuffle splits.

    Returns:
        Tuple of four lists with one entry per split, filled from positions
        2, 6, 7 and 9 of the ``predictIBD`` result: ``(auc, prec, f1,
        feat_imp)``.
    """
    splitter = StratifiedShuffleSplit(n_splits=folds)
    splitter.get_n_splits(otu_use, y)

    auc_crossVal, auc_prec_crossVal = [], []
    f1_crossVal, feat_imp_crossVal = [], []

    for fit_rows, val_rows in splitter.split(otu_use, y):
        labels = np.array(y)
        otu_train = otu_use.iloc[fit_rows, :]
        otu_val = otu_use.iloc[val_rows, :]
        y_train = labels[fit_rows]
        y_val = labels[val_rows]

        # Select the left panel before predictIBD runs.
        plt.subplot(1, 2, 1)
        outcome = predictIBD(otu_train, y_train, otu_val, y_val,
                             max_depth=max_depth,
                             n_estimators=n_estimators,
                             weight=weight,
                             plot=plot,
                             plot_pr=plot_pr,
                             feat_imp=True)
        # Nine values come back; only four are recorded.
        _, auc, _, _, _, prec, f1, _, feat_imp = outcome

        auc_crossVal.append(auc)
        auc_prec_crossVal.append(prec)
        f1_crossVal.append(f1)
        feat_imp_crossVal.append(feat_imp)

    return (auc_crossVal, auc_prec_crossVal, f1_crossVal, feat_imp_crossVal)
def execute(self, params, **kwargs):
    """Prepare train/test features and a splitter in ``self.marvin_dataset``.

    Rows with missing values in the selected columns are dropped from
    ``self.marvin_initial_dataset['train']`` and ``['test']``.  The ``Sex``
    column is encoded as ``male -> 1`` / ``female -> 0``.

    Args:
        params: dict with ``"pred_cols"`` (list of predictor columns, which
            must include ``'Sex'``) and ``"dep_var"`` (target column).
        **kwargs: accepted but not used here.

    Side effects:
        Sets ``self.marvin_dataset`` to a dict with the keys ``X_train``,
        ``y_train``, ``X_test`` and ``sss`` (an unfitted
        ``StratifiedShuffleSplit`` with 5 splits, 60 % test, random_state=0).
    """
    from sklearn.model_selection import StratifiedShuffleSplit

    sex_codes = {'male': 1, 'female': 0}

    train_no_na = self.marvin_initial_dataset['train'][
        params["pred_cols"] + [params["dep_var"]]].dropna()
    print("Length: {}".format(len(train_no_na)))

    # Feature Engineering.  An explicit copy makes the frame independent of
    # train_no_na, so the column assignment is not chained assignment.
    data_X = train_no_na[params["pred_cols"]].copy()
    data_X.loc[:, 'Sex'] = data_X.loc[:, 'Sex'].map(sex_codes)
    data_y = train_no_na[params["dep_var"]]

    # Prepare for Stratified Shuffle Split
    sss = StratifiedShuffleSplit(n_splits=5, test_size=.6, random_state=0)

    # Get Test Dataset
    test_no_na = self.marvin_initial_dataset['test'][
        params["pred_cols"]].dropna()
    print("Length: {}".format(len(test_no_na)))

    # Feature Engineering
    test_X = test_no_na[params["pred_cols"]].copy()
    test_X.loc[:, 'Sex'] = test_X.loc[:, 'Sex'].map(sex_codes)

    self.marvin_dataset = {
        'X_train': data_X,
        'y_train': data_y,
        'X_test': test_X,
        'sss': sss
    }
    print("Preparation is Done!!!!")
def find_best(classifier, parameters):
    """Grid-search a SelectKBest + classifier pipeline and export the winner.

    The search runs on the module-level ``features`` / ``labels`` with three
    stratified shuffle splits (90 % test, random_state=42).  The best
    parameters and the selected feature names are printed; the best
    estimator is then passed to ``dump_classifier_and_data`` and
    ``test_classifier`` together with ``my_dataset``.

    Args:
        classifier: estimator used as the pipeline's ``clf`` step.
        parameters: parameter grid for ``GridSearchCV``.
    """
    estimators = [('select', SelectKBest()), ('clf', classifier)]
    pipe = Pipeline(estimators)

    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.9, random_state=42)
    result = GridSearchCV(pipe, parameters, cv=sss)
    result.fit(features, labels)
    clf = result.best_estimator_
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(result.best_params_)

    # features_list[0] is skipped via the +1 offset; "poi" is put in front.
    selected = clf.named_steps['select'].get_support(indices=True)
    my_features_list = [features_list[i + 1] for i in selected]
    my_features_list.insert(0, "poi")
    print(my_features_list)

    dump_classifier_and_data(clf, my_dataset, my_features_list)
    test_classifier(clf, my_dataset, my_features_list)
def train_test_split(X, y, rnd_seed):
    """
    Stratified train/test split of features and labels.

    The test share is read from the module-level name ``test_size``.

    :param X: feature set, should be array or list
    :param y: labels, should be array or list
    :param rnd_seed: random seed
    :return: X_train, X_test, y_train, y_test as plain lists
    """
    # The splitter works on positions only; the real features are picked
    # element by element afterwards.
    positions = list(range(len(y)))
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                      random_state=rnd_seed)
    splitter.get_n_splits(positions, y)
    train_rows, test_rows = next(splitter.split(positions, y))

    def take(seq, rows):
        return [seq[i] for i in rows]

    return (take(X, train_rows), take(X, test_rows),
            take(y, train_rows), take(y, test_rows))
def get_partition(self):
    """Build train/val/test id and label partitions.

    Train and validation come from ``self.get_binarized_data()`` through one
    stratified shuffle split whose validation share is
    ``self.config.val_size``.  Test ids and labels are read from the
    whitespace-delimited file ``MICCAI_Test.txt`` (first column is the
    index, first row the header); its columns are passed through
    ``self.le.fit_transform``.

    Returns:
        ``(partition_ids, partition_labels)``: two dicts with the keys
        ``'train'``, ``'val'`` and ``'test'``, each mapping to a list.
    """
    df = self.get_binarized_data()
    ids = df.index
    labels = df.values.flatten()

    sss = StratifiedShuffleSplit(n_splits=1, test_size=self.config.val_size)
    train_index, val_index = next(sss.split(ids, labels))
    ids_train, ids_val = ids[train_index], ids[val_index]
    y_train, y_val = labels[train_index], labels[val_index]

    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True.
    test_data = pd.read_table('MICCAI_Test.txt', index_col=0, sep=r'\s+',
                              header=0)
    ids_test = test_data.index
    y_test = test_data.apply(self.le.fit_transform).values.flatten()

    partition_ids = {'train': list(ids_train),
                     'val': list(ids_val),
                     'test': list(ids_test)}
    partition_labels = {'train': list(y_train),
                        'val': list(y_val),
                        'test': list(y_test)}
    return partition_ids, partition_labels
def StraitKFold(classifier, X, y):
    """Evaluate ``classifier`` on three stratified shuffle splits.

    Args:
        classifier: estimator exposing ``fit``, ``predict`` and
            ``decision_function``.
        X: feature frame, indexed with ``.iloc``.
        y: label frame; its first column is used as the target.

    Returns:
        ``(y_scores, y_tests, accuracy, f1_score)``: two frames with one
        ``fold_<n>`` column per split (decision scores and true labels),
        followed by the mean accuracy and the mean F1 score over the splits.
    """
    skf = StratifiedShuffleSplit(n_splits=3)

    y_scores = pd.DataFrame()
    y_tests = pd.DataFrame()
    y_pred = pd.DataFrame()
    f1_per_fold = []
    acc_per_fold = []

    for n, (train_index, test_index) in enumerate(skf.split(X, y)):
        column = 'fold_' + str(n)
        y_true = y.iloc[test_index, 0]

        classifier.fit(X.iloc[train_index, :], y.iloc[train_index, 0])
        y_scores[column] = classifier.decision_function(X.iloc[test_index, :])
        y_pred[column] = classifier.predict(X.iloc[test_index, :])
        y_tests[column] = y_true.values

        f1_per_fold.append(metrics.f1_score(y_true, y_pred.iloc[:, n]))
        # Accuracy gets its own accumulator; it used to be appended to the
        # F1 values and overwritten on every split.
        acc_per_fold.append(metrics.accuracy_score(y_true, y_pred.iloc[:, n]))

    mean_f1 = np.mean(f1_per_fold)
    accuracy = np.mean(acc_per_fold)
    print('mean accuracy score: ' + str(accuracy))
    print('mean f1 score: ' + str(mean_f1))
    return y_scores, y_tests, accuracy, mean_f1
'low_interest_manager_id', 'low_interest_building_id', 'low_interest_display_address', 'n_listings_of_manager'] features.extend(common_managers) print(f'number of features: {len(features)}') X = sparse.hstack([train[features], train_ft_tfidf_transformed]).tocsr() train['interest_level'] = train.interest_level.apply(lambda x: set_int_for_category(x)) y = train['interest_level'] sss = StratifiedShuffleSplit(n_splits = 2, test_size=0.35, random_state=0) sss.get_n_splits(X=X, y=y) for train_index, test_index in sss.split(X, y): print("TRAIN:", train_index, "TEST:", test_index) X_train, X_validate = X[train_index], X[test_index] y_train, y_validate = y.iloc[train_index], y.iloc[test_index] # create dataset for lightgbm lgb_train = lgb.Dataset(X_train, label=y_train) lgb_eval = lgb.Dataset(X_validate, label=y_validate, reference=lgb_train) print('data set has been setup') # specify your configurations as a dict params = { 'task': 'train',
w_featdim = random.choice((64, 128, 256)), w_featdrop = random.choice((0.1, 0.2, 0.5)), rnn = random.choice(('GRU', 'LSTM'))) return o, hash(str(o)) def __str__(self): str = "" for attr in self.__slots__: str += '{}={}, '.format(attr, getattr(self, attr)) return str[:-2] data = load(opt.input_prefix) nepoch = 40 ssp = StratifiedShuffleSplit(n_splits=1, test_size=0.2) ssp.get_n_splits(data.docs, data.labels) trn_idx, dev_idx = list(ssp.split(data.docs, data.labels))[0] trn_labels = to_categorical(np.array(data.labels)[trn_idx]) dev_labels = to_categorical(np.array(data.labels)[dev_idx]) search_iter = 1000 search_done = set() for _ in range(search_iter): o, h = Options.sample() if h in search_done: continue search_done.add(h) if not o.c_maxlen: o.c_maxlen = np.max(data.len_char) c_vocab = Counter({k:v for k,v in data.chars.items() if v > o.c_cutoff})
from sklearn.ensemble import RandomForestClassifier m = RandomForestClassifier(class_weight=opt.class_weight,n_estimators=300,random_state=seed) else: from sklearn.svm import LinearSVC m = LinearSVC(dual=True, C=opt.C, verbose=0, class_weight=opt.class_weight) if opt.mult_class == 'ovo': mc = OneVsOneClassifier elif opt.mult_class == 'ovr': mc = OneVsRestClassifier if opt.classifier != 'rf': m = mc(m, n_jobs=opt.n_jobs) ssp = StratifiedShuffleSplit(n_splits=1, test_size=0.2) ssp.get_n_splits(docs, labels) trn_idx, dev_idx = list(ssp.split(data.docs, data.labels))[0] acc = [] f1M = [] train_docs = docs[trn_idx] train_labels = labels[trn_idx] dev_docs = docs[dev_idx] dev_labels = labels[dev_idx] split_size = round(len(trn_idx)/10) for i in range(10): info("training up to {}".format((i+1)*split_size)) m.fit(train_docs[0:(i+1)*split_size], train_labels[0:(i+1)*split_size]) pred = m.predict(dev_docs) acc.append(accuracy_score(dev_labels, pred))