def fit_models(imps, X, Y, all_props, props=None, labels=None,
               n_splits=5,
               clf_args={'n_estimators':25,
                         'max_features':'auto',
                         'random_state':0}):
    """Fit one random forest per (property, imputation, shuffle split).

    Parameters
    ----------
    imps : iterable
        Keys of ``X`` and ``Y`` selecting the data sets to train on.
    X : mapping
        ``X[imp]`` is row-indexed with the train/test indices.
        ``X['missing']`` is read only for its shape
        (observations, features).
    Y : mapping
        ``Y[imp][train, j]`` supplies the training targets and
        ``Y['missing'][test, j]`` the test targets, where ``j`` is the
        position of the property in ``all_props``.
    all_props : list
        Names of all target columns of ``Y``.
    props : list, optional
        Properties to fit; defaults to ``all_props``.
    labels : array-like, optional
        Passed as ``groups`` to the splitter.  When given, a
        ``GroupShuffleSplit`` is used instead of a ``ShuffleSplit``.
    n_splits : int
        Number of shuffle splits.
    clf_args : dict
        Keyword arguments for ``RandomForestClassifier``.  A value that is
        itself a dict is looked up by property name.  The mapping is never
        modified.

    Returns
    -------
    rs, feature_importances, ys, ps : dict
        Each is keyed by ``imp``.  ``rs[imp][p, k]`` is the correlation
        between predicted and true test labels, ``feature_importances``
        has shape (props, features, splits), and ``ys`` / ``ps`` hold the
        test targets and the second ``predict_proba`` column, masked
        beyond the size of each test set.
    """
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0]  # Number of observations.
    n_features = X['missing'].shape[1]  # Number of features.
    n_props = len(props)  # Number of properties to predict.
    test_size = 0.2
    # The model_selection splitters take `n_splits`; `n_iter` belonged to
    # the removed sklearn.cross_validation API and raises TypeError here.
    if labels is None:
        shuffle_split = ShuffleSplit(n_splits=n_splits,
                                     test_size=test_size, random_state=0)
    else:
        shuffle_split = GroupShuffleSplit(n_splits=n_splits,
                                          test_size=test_size, random_state=0)
    # random_state is a fixed int, so every call to split() yields the same
    # partitions; materialise them once and reuse them everywhere.
    splits = list(shuffle_split.split(range(n_obs), groups=labels))
    max_test_samples = max(len(test) for _, test in splits)

    rs = {imp: np.ma.zeros((n_props, n_splits)) for imp in imps}
    ps = {imp: np.ma.masked_all((n_props, n_splits, max_test_samples))
          for imp in imps}
    ys = {imp: np.ma.masked_all((n_props, n_splits, max_test_samples))
          for imp in imps}
    feature_importances = {imp: np.ma.zeros((n_props, n_features, n_splits))
                           for imp in imps}

    for n_prop, prop in enumerate(props):
        j = all_props.index(prop)
        print("Fitting model for %s..." % prop)
        for imp in imps:
            for k, (train, test) in enumerate(splits):
                X_train, X_test = X[imp][train], X[imp][test]
                Y_train, Y_test = Y[imp][train, j], Y['missing'][test, j]
                # Resolve per-property arguments into a fresh dict so the
                # (shared, default) clf_args is never mutated.
                clf_args_ = {key: (value[prop] if isinstance(value, dict)
                                   else value)
                             for key, value in clf_args.items()}
                max_features = clf_args_.get('max_features')
                # Only an integer count can exceed the number of columns;
                # strings, floats and None are passed through unchanged.
                if isinstance(max_features, (int, np.integer)):
                    clf_args_['max_features'] = min(X_train.shape[1],
                                                    max_features)
                rfc = RandomForestClassifier(**clf_args_)
                rfc.fit(X_train, Y_train)
                Y_predict = rfc.predict(X_test)
                probs = rfc.predict_proba(X_test)
                n_test = len(probs)
                if probs.shape[1] < 2 and probs.mean() == 1.0:
                    # Only one class was seen during training, so there is
                    # no second probability column to store.
                    ps[imp][n_prop, k, :n_test] = 0.0
                else:
                    ps[imp][n_prop, k, :n_test] = probs[:, 1]
                ys[imp][n_prop, k, :n_test] = Y_test
                rs[imp][n_prop, k] = np.ma.corrcoef(Y_predict, Y_test)[0, 1]
                feature_importances[imp][n_prop, :, k] = \
                    rfc.feature_importances_
    return rs, feature_importances, ys, ps
def FitModel(cnnc, A, Y, T, FN):
    """Fit ``cnnc`` on a random split, report accuracy, then refit on all data.

    ``A`` and ``Y`` are the inputs and targets given to ``cnnc.fit``; ``T``
    holds the reference strings compared against the decoded predictions
    and ``FN`` the per-row labels printed next to sampled predictions.
    Returns ``cnnc`` after the final fit on the full data.
    """
    print('Fitting model...')
    splitter = ShuffleSplit(n_splits = 1)
    trn, tst = next(splitter.split(A))
    # Fit the network on the training part only
    cnnc.fit(A[trn], Y[trn])
    # Predict in 32 chunks and stack the per-chunk outputs
    chunks = np.array_split(np.arange(A.shape[0]), 32)
    YH = np.vstack([cnnc.predict(A[chunk]) for chunk in chunks])
    # Join every predicted row into a single string
    PS = np.array([''.join(row) for row in YH])
    # Accuracy on both parts of the split
    train_acc = SAcc(PS[trn], T[trn])
    test_acc = SAcc(PS[tst], T[tst])
    print('Train: ' + str(train_acc))
    print('Test: ' + str(test_acc))
    for predicted, target, fname in zip(PS, T, FN):
        # Print roughly one row in a hundred
        if np.random.rand() <= 0.99:
            continue
        print(fname + ': ' + target + ' -> ' + predicted)
    print('Fitting with CV data...')
    # Final fit on everything with a reduced iteration budget
    cnnc.SetMaxIter(4)
    cnnc.fit(A, Y)
    return cnnc
def main():
    """Extract (lemma, form) pairs from a TSV file and write shuffled folds.

    Rows whose ``--key-idx`` column equals ``--key`` are kept as
    ``(row[0], row[idx])``.  For every fold produced by ``ShuffleSplit`` a
    directory ``res/ryan_splits/<lang>-10fold/<fold>/`` is created with
    ``train.uniq``, ``dev.uniq`` and ``test.uniq``; the held-out 20% is
    divided alternately between dev and test.
    """
    import argparse
    import os
    from io import open as uopen
    from random import shuffle

    from sklearn.model_selection import ShuffleSplit

    parser = argparse.ArgumentParser()
    parser.add_argument('fname')
    # NOTE: argparse ignores `default` for a required positional argument.
    parser.add_argument('idx', default=2, type=int)
    parser.add_argument('--key', default=u'V;1;SG;IND;PST;PFV')
    parser.add_argument('--shuffle', action='store_true')
    parser.add_argument('--folds', default=10, type=int)
    parser.add_argument('--lang', default='sp')
    parser.add_argument('--key-idx', default=3, type=int)
    args = parser.parse_args()

    # Close the input file deterministically and skip blank lines, which
    # would otherwise fail on the column lookup below.
    with uopen(args.fname, encoding='utf-8') as fh:
        lines = [x.strip().split(u'\t') for x in fh if x.strip()]
    to_extract = [(x[0], x[args.idx]) for x in lines
                  if x[args.key_idx] == args.key]
    if args.shuffle:
        shuffle(to_extract)

    rs = ShuffleSplit(n_splits=args.folds, test_size=0.2, random_state=42)
    for i, (train_indices, test_indices) in enumerate(rs.split(to_extract)):
        fold_dir = 'res/ryan_splits/{}-10fold/{}'.format(args.lang, i)
        # os.makedirs replaces distutils.dir_util.mkpath (distutils was
        # removed from the standard library in Python 3.12).
        if not os.path.isdir(fold_dir):
            os.makedirs(fold_dir)
        # The context manager guarantees all three files are flushed and
        # closed before the next fold starts.
        with uopen('res/ryan_splits/{}-10fold/{}/train.uniq'.format(args.lang, i),
                   mode='w', encoding='utf-8') as train_fh, \
             uopen('res/ryan_splits/{}-10fold/{}/dev.uniq'.format(args.lang, i),
                   mode='w', encoding='utf-8') as dev_fh, \
             uopen('res/ryan_splits/{}-10fold/{}/test.uniq'.format(args.lang, i),
                   mode='w', encoding='utf-8') as test_fh:
            for idx in train_indices:
                train_fh.write(u'{}\t{}\n'.format(to_extract[idx][0],
                                                  to_extract[idx][1]))
            for j, idx in enumerate(test_indices):
                # Even positions go to dev, odd positions to test.
                out_fh = dev_fh if j % 2 == 0 else test_fh
                out_fh.write(u'{}\t{}\n'.format(to_extract[idx][0],
                                                to_extract[idx][1]))
def run():
    """Train one network per shuffle split and write one submission each.

    Targets are fitted as ``log(y + 200)`` and predictions mapped back with
    ``exp(.) - 200``.  A fold is skipped when its submission file already
    exists; training is skipped when its weights file already exists.
    Finishes by calling ``ensemble_predictions()``.
    """
    # Data set, with the target moved to log space
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # One model instance, reset to its initial weights before each training
    model = init_model(X_train.shape[1])
    vanilla_weights = model.get_weights()

    splitter = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1,
                            random_state=0)
    folds = enumerate(splitter.split(X_train), start=1)
    for fold_number, (train_index, valid_index) in folds:
        print("Working on {}/{} ...".format(fold_number, CROSS_VALIDATION_NUM))

        weights_file = "/tmp/Optimal_Weights_{}.h5".format(fold_number)
        submission_name = "submission_{}.csv".format(fold_number)
        submission_file = os.path.join(SUBMISSION_FOLDER_PATH, submission_name)

        # Nothing to do for a fold that already produced its submission
        if os.path.isfile(submission_file):
            continue

        weights_missing = not os.path.isfile(weights_file)
        if weights_missing:
            # Start from the untrained weights
            model.set_weights(vanilla_weights)

            # Early stopping and best-checkpoint saving during training
            callbacks = [
                EarlyStopping(monitor="val_actual_mae",
                              patience=EARLYSTOPPING_PATIENCE),
                ModelCheckpoint(weights_file, monitor="val_loss",
                                save_best_only=True),
            ]
            validation_data = (X_train[valid_index], Y_train[valid_index])
            model.fit(X_train[train_index], Y_train[train_index],
                      batch_size=TRAIN_BATCH_SIZE,
                      nb_epoch=MAXIMUM_EPOCH_NUM,
                      validation_data=validation_data,
                      callbacks=callbacks,
                      verbose=2)

        # Predict with the best checkpoint of this fold
        model.load_weights(weights_file)
        Y_test = model.predict(X_test, batch_size=TEST_BATCH_SIZE, verbose=2)

        # Write the submission, creating the folder on first use
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file, index=False)

    # Combine the per-fold submissions
    ensemble_predictions()

    print("All done!")
def train_model(clf, X, Y, name="NB ngram", plot=False):
    """Evaluate ``clf`` on 10 shuffle splits and report summary statistics.

    For every split the classifier is refitted in place, its accuracy is
    recorded on both parts, and the precision/recall curve of the second
    ``predict_proba`` column is computed on the test part.  A line with
    mean/std of test accuracy and of the P/R AUC is printed.

    When ``plot`` is true, the split with the median P/R AUC is plotted with
    ``plot_pr`` and its false positives are logged with
    ``log_false_positives``.

    Returns
    -------
    tuple
        ``(mean train error, mean test error)`` over all splits.
    """
    import copy

    cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    # Per-split snapshots, needed only to revisit the median split later.
    clfs = []
    test_indices = []

    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        if plot:
            # `clf` is refitted in place on every split, so appending the
            # object itself would leave ten references to the last fit.
            # Keep an independent copy, plus the matching test indices.
            clfs.append(copy.deepcopy(clf))
            test_indices.append(test)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        median = np.argsort(pr_scores)[len(pr_scores) // 2]

        # NOTE(review): `phase` is not defined in this function; it is
        # presumably a module-level name - confirm.
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        # Score the median classifier on the test part of its own split.
        median_test = test_indices[median]
        log_false_positives(clfs[median], X[median_test], Y[median_test],
                            name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    """Evaluate freshly built classifiers on 10 shuffle splits.

    ``clf_factory()`` is called once per split to obtain a new classifier.
    Train/test accuracy and the area under the precision/recall curve of
    the second ``predict_proba`` column are collected per split, a summary
    line is printed, and the median split is plotted when ``plot`` is true.

    Returns ``(mean train error, mean test error)``.
    """
    splitter = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

    train_errors, test_errors = [], []
    scores, pr_scores = [], []
    precisions, recalls, thresholds = [], [], []

    for train, test in splitter.split(X):
        X_train, X_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        positive_proba = clf.predict_proba(X_test)[:, 1]
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, positive_proba)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    # Index of the split whose P/R AUC sits in the middle of the ranking
    ranking = np.argsort(pr_scores)
    median = ranking[len(pr_scores) // 2]

    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    mean_train_error = np.mean(train_errors)
    mean_test_error = np.mean(test_errors)
    return mean_train_error, mean_test_error
def fit_models_mc(imps, X, Y, all_props, props=None, labels=None,
                  n_splits=5,
                  clf_args={'n_estimators':25,
                            'max_features':'auto',
                            'random_state':0}):
    """Fit a one-vs-rest random forest on all requested properties at once.

    Parameters
    ----------
    imps : iterable
        Keys of ``X`` and ``Y`` selecting the data sets to train on.
    X : mapping
        ``X[imp]`` is row-indexed with the train/test indices.
        ``X['missing']`` is read only for the number of observations.
    Y : mapping
        ``Y[imp]`` supplies training targets and ``Y['missing']`` the test
        targets; the columns listed in ``props`` are selected from both.
    all_props : list
        Names of all target columns of ``Y``.
    props : list, optional
        Properties to fit; defaults to ``all_props``.  Row ``i`` of every
        result corresponds to ``props[i]``.
    labels : array-like, optional
        Passed as ``groups`` to the splitter.  When given, a
        ``GroupShuffleSplit`` is used instead of a ``ShuffleSplit``.
    n_splits : int
        Number of shuffle splits.
    clf_args : dict
        Keyword arguments for ``RandomForestClassifier``.  The mapping is
        never modified.

    Returns
    -------
    rs, feature_importances, ys, ps
        ``rs``, ``ys`` and ``ps`` are dicts keyed by ``imp`` (correlations,
        test targets and predicted probabilities); ``feature_importances``
        is always ``None`` in this variant.

    Raises
    ------
    ValueError
        If a ``clf_args`` value is a dict: per-property arguments cannot be
        resolved when a single model is fitted for all properties.
    """
    # Function-scope import keeps this block self-contained.
    from sklearn.model_selection import GroupShuffleSplit

    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0]  # Number of observations.
    n_props = len(props)  # Number of properties to predict.
    test_size = 0.2
    # model_selection splitters take `n_splits` and are enumerated through
    # split(); `n_iter`, LabelShuffleSplit and direct iteration belong to
    # the removed sklearn.cross_validation API.
    if labels is None:
        shuffle_split = ShuffleSplit(n_splits=n_splits,
                                     test_size=test_size, random_state=0)
    else:
        shuffle_split = GroupShuffleSplit(n_splits=n_splits,
                                          test_size=test_size, random_state=0)
    # random_state is a fixed int, so the partitions are reproducible and
    # can be materialised once.
    splits = list(shuffle_split.split(range(n_obs), groups=labels))
    max_test_samples = max(len(test) for _, test in splits)

    rs = {imp: np.ma.zeros((n_props, n_splits)) for imp in imps}
    ps = {imp: np.ma.masked_all((n_props, n_splits, max_test_samples))
          for imp in imps}
    ys = {imp: np.ma.masked_all((n_props, n_splits, max_test_samples))
          for imp in imps}
    feature_importances = None
    # Target columns in the order of `props`, so that the result arrays
    # (sized by n_props) line up with the selected targets.
    cols = np.array([all_props.index(prop) for prop in props])

    for key, value in clf_args.items():
        if isinstance(value, dict):
            raise ValueError(
                "clf_args[%r] is a per-property dict, which is not "
                "supported when all properties share one model" % (key,))

    for imp in imps:
        for k, (train, test) in enumerate(splits):
            X_train, X_test = X[imp][train, :], X[imp][test, :]
            Y_train = Y[imp][train][:, cols]
            Y_test = Y['missing'][test][:, cols]
            clf_args_ = dict(clf_args)  # private copy, safe to edit
            max_features = clf_args_.get('max_features')
            # Only an integer count can exceed the number of columns.
            if isinstance(max_features, (int, np.integer)):
                clf_args_['max_features'] = min(X_train.shape[1],
                                                max_features)
            rfc = RandomForestClassifier(**clf_args_)
            onevsrest = OneVsRestClassifier(rfc)
            onevsrest.fit(X_train, Y_train)
            Y_predict = onevsrest.predict(X_test)
            probs = onevsrest.predict_proba(X_test)
            n_test = len(probs)
            if probs.shape[1] < 2 and probs.mean() == 1.0:
                ps[imp][:, k, :n_test] = 0.0
            else:
                ps[imp][:, k, :n_test] = probs.T
            ys[imp][:, k, :n_test] = Y_test.T
            for i in range(n_props):
                rs[imp][i, k] = np.ma.corrcoef(Y_predict[:, i],
                                               Y_test[:, i])[0, 1]
    return rs, feature_importances, ys, ps
def TestPerformance(self, df = None):
    """Print the score of ``self.R`` on all rows and on one random split.

    Uses ``self.D`` when ``df`` is None; otherwise a copy of ``df`` is
    passed through ``self.S.transform`` first.
    """
    # Choose the learned frame or a transformed copy of the given one
    D = self.D if df is None else self.S.transform(df.copy())
    # Feature matrix
    A = self._ExtractFeat(D)
    # Target values (the accompanying column names are not needed)
    y, _ = self._ExtractTarg(D)
    # A single random train/test partition
    splitter = ShuffleSplit(n_splits = 1)
    for trn, tst in splitter.split(A):
        all_score = self.R.score(A, y)
        tst_score = self.R.score(A[tst], y[tst])
        trn_score = self.R.score(A[trn], y[trn])
        report = 'C-V:\t' + str(all_score)
        report += '\nTst:\t' + str(tst_score)
        report += '\nTrn:\t' + str(trn_score)
        print(report)
def test_safe_split_with_precomputed_kernel():
    """_safe_split must slice a precomputed kernel consistently with raw data.

    The kernel ``K = X X^T`` split through a ``kernel="precomputed"``
    estimator has to equal the kernel recomputed from the split raw data,
    and the targets returned for both estimators have to agree.
    """
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    tr, te = list(cv.split(X))[0]

    X_tr, y_tr = _safe_split(clf, X, y, tr)
    K_tr, y_tr2 = _safe_split(clfp, K, y, tr)
    assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T))
    # The targets were previously computed but never checked.
    assert_array_almost_equal(y_tr, y_tr2)

    X_te, y_te = _safe_split(clf, X, y, te, tr)
    K_te, y_te2 = _safe_split(clfp, K, y, te, tr)
    assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
    assert_array_almost_equal(y_te, y_te2)
def run():
    """Train one XGBoost regressor per shuffle split and write submissions.

    Targets are fitted as ``log(y + 200)`` and predictions mapped back with
    ``exp(.) - 200``.  Folds whose submission file already exists are
    skipped.  Finishes by calling ``ensemble_predictions()``.
    """
    # Data set, with the target moved to log space
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    def actual_mae(y_predicted, y_true):
        # Evaluation metric: MAE after undoing the log on both sides
        error = mean_absolute_error(np.exp(y_true.get_label()),
                                    np.exp(y_predicted))
        return ("actual_mae", error)

    splitter = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1,
                            random_state=0)
    folds = enumerate(splitter.split(X_train), start=1)
    for fold_number, (train_index, valid_index) in folds:
        print("Working on {}/{} ...".format(fold_number, CROSS_VALIDATION_NUM))

        submission_name = "submission_{}.csv".format(fold_number)
        submission_file = os.path.join(SUBMISSION_FOLDER_PATH, submission_name)
        if os.path.isfile(submission_file):
            continue

        # The fold number doubles as the random seed
        hyperparameters = dict(
            learning_rate=0.01, max_depth=12, n_estimators=N_ESTIMATORS,
            silent=False, objective="reg:linear", gamma=1,
            min_child_weight=1, subsample=0.8, colsample_bytree=0.5,
            reg_alpha=1, seed=fold_number, nthread=-1)
        model = XGBRegressor(**hyperparameters)

        validation_set = (X_train[valid_index], Y_train[valid_index])
        model.fit(X_train[train_index], Y_train[train_index],
                  eval_set=[validation_set],
                  eval_metric=actual_mae,
                  early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                  verbose=True)

        # Predict on the test set
        Y_test = model.predict(X_test)

        # Write the submission, creating the folder on first use
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file, index=False)

    # Combine the per-fold submissions
    ensemble_predictions()

    print("All done!")
def run():
    """Train one GBMRegressor per shuffle split and write submissions.

    Targets are fitted as ``log(y + 200)`` and predictions mapped back with
    ``exp(.) - 200``.  Folds whose submission file already exists are
    skipped.  Finishes by calling ``ensemble_predictions()``.
    """
    # Data set, with the target moved to log space
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    splitter = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1,
                            random_state=0)
    folds = enumerate(splitter.split(X_train), start=1)
    for fold_number, (train_index, valid_index) in folds:
        print("Working on {}/{} ...".format(fold_number, CROSS_VALIDATION_NUM))

        submission_name = "submission_{}.csv".format(fold_number)
        submission_file = os.path.join(SUBMISSION_FOLDER_PATH, submission_name)
        if os.path.isfile(submission_file):
            continue

        # The fold number seeds both feature and bagging sampling
        hyperparameters = dict(
            learning_rate=0.01, num_iterations=NUM_ITERATIONS,
            num_leaves=200, min_data_in_leaf=10,
            feature_fraction=0.3, feature_fraction_seed=fold_number,
            bagging_fraction=0.8, bagging_freq=10, bagging_seed=fold_number,
            metric="l1", metric_freq=10,
            early_stopping_round=EARLY_STOPPING_ROUND, num_threads=-1)
        model = GBMRegressor(**hyperparameters)

        validation_set = (X_train[valid_index], Y_train[valid_index])
        model.fit(X_train[train_index], Y_train[train_index],
                  test_data=[validation_set])

        # Predict on the test set
        Y_test = model.predict(X_test)

        # Write the submission, creating the folder on first use
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file, index=False)

    # Combine the per-fold submissions
    ensemble_predictions()

    print("All done!")
def plot_shuffle_split():
    """Draw which of 10 points each of 4 ShuffleSplit iterations selects.

    Every data point is one column of horizontal bars and every split one
    row: test points are grey and hatched, training points white and
    hatched, points left out of the split white without hatch.
    """
    from sklearn.model_selection import ShuffleSplit
    plt.figure(figsize=(10, 2))
    plt.title("ShuffleSplit with 10 points"
              ", train_size=5, test_size=2, n_splits=4")

    axes = plt.gca()
    axes.set_frame_on(False)

    n_samples = 10
    n_iter = 4
    n_samples_per_fold = 1

    ss = ShuffleSplit(n_splits=n_iter, train_size=5, test_size=2,
                      random_state=43)
    # 0 = not selected, 1 = training, 2 = test
    mask = np.zeros((n_iter, n_samples))
    for i, (train, test) in enumerate(ss.split(range(n_samples))):
        mask[i, train] = 1
        mask[i, test] = 2

    for i in range(n_samples):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
        # The bar positions are passed positionally: the first parameter of
        # barh is named `y` in current matplotlib (it used to be `bottom`),
        # so a positional argument works with either signature.
        boxes = axes.barh(range(n_iter), [1 - 0.1] * n_iter,
                          left=i * n_samples_per_fold, height=.6,
                          color=colors, hatch="//", edgecolor='k',
                          align='edge')
        # not selected has no hatch
        for j in np.where(mask[:, i] == 0)[0]:
            boxes[j].set_hatch("")

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter) + .3)
    axes.set_yticklabels(["Split %d" % x for x in range(1, n_iter + 1)])
    # legend hacked for this random state: it picks bars of the last column
    plt.legend([boxes[1], boxes[0], boxes[2]],
               ["Training set", "Test set", "Not selected"], loc=(1, .3))
    plt.tight_layout()
def test_safe_split_with_precomputed_kernel():
    """_safe_split must treat a precomputed kernel like the raw data.

    Splitting ``K = X X^T`` through a ``kernel="precomputed"`` estimator
    has to give the kernel of the split raw data, with identical targets.
    """
    raw_estimator = SVC()
    kernel_estimator = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    K = np.dot(X, X.T)

    splitter = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = next(iter(list(splitter.split(X))))

    # Training part: kernel is taken between training rows only
    X_train, y_train = _safe_split(raw_estimator, X, y, train)
    K_train, y_train2 = _safe_split(kernel_estimator, K, y, train)
    expected_train_kernel = np.dot(X_train, X_train.T)
    assert_array_almost_equal(K_train, expected_train_kernel)
    assert_array_almost_equal(y_train, y_train2)

    # Test part: kernel is taken between test rows and training rows
    X_test, y_test = _safe_split(raw_estimator, X, y, test, train)
    K_test, y_test2 = _safe_split(kernel_estimator, K, y, test, train)
    expected_test_kernel = np.dot(X_test, X_train.T)
    assert_array_almost_equal(K_test, expected_test_kernel)
    assert_array_almost_equal(y_test, y_test2)
# k-fold validation
# k-fold is a type of cross-validation where the data are divided into k bins.
# For each experiment one of the k bins is picked as the test set and the
# remaining k-1 bins are used for training. k separate experiments are run
# and the k test results are averaged.
# This technique tests different parts of the data to prevent overfitting,
# i.e. it prevents grid search from returning a parameter set that is
# optimized for one specific training set but not overall.
from sklearn.model_selection import KFold

cv_set = KFold(n_splits=10)
# Iterate the KFold splitter itself; `cv_sets` is only assigned in the
# ShuffleSplit section below.
for train_index, test_index in cv_set.split(X):
    print("%s %s" % (train_index, test_index))

# ShuffleSplit
# ShuffleSplit() is an alternative form of cross-validation (see the
# 'cv_sets' variable). It creates 10 ('n_splits') shuffled sets, and for each
# shuffle 20% ('test_size') of the data is used as the validation set.
from sklearn.model_selection import ShuffleSplit

cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 0)
for train_index, test_index in cv_sets.split(X):
    print("%s %s" % (train_index, test_index))

from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score

# Pipelining
# Sequentially apply a list of transforms and a final estimator. Intermediate
# steps of the pipeline must be 'transforms', that is, they must implement
# fit and transform methods. The final estimator only needs to implement fit.
# The purpose of the pipeline is to assemble several steps that can be
# cross-validated together while setting different parameters.
from sklearn import svm
# NOTE(review): sklearn.datasets.samples_generator was removed in
# scikit-learn 0.24 - confirm the targeted version.
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
def train(working, max_samples, duration, rate, batch_size, epochs,
          epoch_size, validation_size, early_stopping, reduce_lr, seed):
    '''Build the model, split the training index and fit with generators.

    The pump is unpickled from ``OUTPUT_PATH``; the model specification and
    the best weights are written back to the same directory.

    Parameters
    ----------
    working : str
        directory that contains the experiment data (h5)

    max_samples : int
        Maximum number of samples per streamer

    duration : float
        Duration of training patches

    rate : int
        Poisson rate for pescador

    batch_size : int
        Size of batches

    epochs : int
        Maximum number of epochs

    epoch_size : int
        Number of batches per epoch

    validation_size : int
        Number of validation batches

    early_stopping : int
        Patience (in epochs) of the early-stopping callback

    reduce_lr : int
        Patience (in epochs) of the learning-rate reduction callback

    seed : int
        Random seed
    '''
    # Pump, sampler and model
    pump_path = os.path.join(OUTPUT_PATH, 'pump.pkl')
    with open(pump_path, 'rb') as fd:
        pump = pickle.load(fd)
    sampler = make_sampler(max_samples, duration, pump, seed)
    model, inputs, outputs = construct_model(pump)

    # Training index, split 75/25 into train and validation rows
    full_index = pd.read_json('index_train.json')
    splitter_tv = ShuffleSplit(n_splits=1, test_size=0.25,
                               random_state=seed)
    train_rows, val_rows = next(splitter_tv.split(full_index))
    idx_train = full_index.iloc[train_rows]
    idx_val = full_index.iloc[val_rows]

    # Augmented training stream
    train_stream = data_generator(working,
                                  idx_train['id'].values, sampler, epoch_size,
                                  augment=True,
                                  lam=rate,
                                  batch_size=batch_size,
                                  revive=True,
                                  random_state=seed)
    gen_train = keras_tuples(train_stream(), inputs=inputs, outputs=outputs)

    # Un-augmented validation stream
    val_stream = data_generator(working,
                                idx_val['id'].values, sampler, len(idx_val),
                                augment=False,
                                batch_size=batch_size,
                                revive=True,
                                random_state=seed)
    gen_val = keras_tuples(val_stream(), inputs=inputs, outputs=outputs)

    # Both outputs share the same loss and metric
    loss = {'beat': 'binary_crossentropy',
            'downbeat': 'binary_crossentropy'}
    metrics = {'beat': 'accuracy', 'downbeat': 'accuracy'}
    monitor = 'val_loss'

    model.compile(K.optimizers.Adam(), loss=loss, metrics=metrics)

    # Persist the model specification
    model_spec = K.utils.serialize_keras_object(model)
    spec_path = os.path.join(OUTPUT_PATH, 'model_spec.pkl')
    with open(spec_path, 'wb') as fd:
        pickle.dump(model_spec, fd)

    # Checkpointing, learning-rate reduction and early stopping all watch
    # the same quantity
    weight_path = os.path.join(OUTPUT_PATH, 'model.h5')
    cb = [
        K.callbacks.ModelCheckpoint(weight_path,
                                    save_best_only=True,
                                    verbose=1,
                                    monitor=monitor),
        K.callbacks.ReduceLROnPlateau(patience=reduce_lr,
                                      verbose=1,
                                      monitor=monitor),
        K.callbacks.EarlyStopping(patience=early_stopping,
                                  verbose=1,
                                  monitor=monitor),
    ]

    # Fit the model
    model.fit_generator(gen_train, epoch_size, epochs,
                        validation_data=gen_val,
                        validation_steps=validation_size,
                        callbacks=cb)
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import ShuffleSplit

# Iris features and targets
iris = datasets.load_iris()
X, y = iris.data, iris.target

# One reproducible 60/40 train/test partition
iris_ss = ShuffleSplit(train_size=0.6, test_size=0.4, random_state=0)
train_index, test_index = next(iris_ss.split(X))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# Fit a default SVC on the training part and print its test accuracy
clf = svm.SVC()
clf.fit(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(test_accuracy)
# Decoding in sensor space using a linear SVM

from sklearn.svm import SVC  # noqa
from sklearn.model_selection import ShuffleSplit  # noqa
from mne.decoding import CSP  # noqa

n_components = 3  # number of CSP components to keep
svc = SVC(C=1, kernel='linear')
csp = CSP(n_components=n_components, norm_trace=False)

# Monte-Carlo cross-validation generator (reduces variance):
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
scores = []
epochs_data = epochs.get_data()

for train_idx, test_idx in cv.split(labels):
    y_train = labels[train_idx]
    y_test = labels[test_idx]

    # The spatial filters are learned on the training epochs only
    X_train = csp.fit_transform(epochs_data[train_idx], y_train)
    X_test = csp.transform(epochs_data[test_idx])

    # Fit the classifier and keep its test accuracy
    svc.fit(X_train, y_train)
    scores.append(svc.score(X_test, y_test))

# Report mean accuracy next to the share of the majority class
class_balance = np.mean(labels == labels[0])
class_balance = max(class_balance, 1. - class_balance)
print("Classification accuracy: %f / Chance level: %f" % (np.mean(scores),
                                                          class_balance))
] else: num_stds_config = [num_stds] # only need to generate models once for all CVs all_models = model_all_training_graphs(train_sketches, train_targets, args['size']) num_cross_validation = 5 # kf = KFold(n_splits=num_cross_validation) kf = ShuffleSplit(n_splits=num_cross_validation, test_size=0.2, random_state=0) print "We will perform " + str( num_cross_validation) + "-fold cross validation..." for benign_train, benign_validate in kf.split(train_targets): benign_validate_sketches, benign_validate_names = train_sketches[ benign_validate], train_targets[benign_validate] kf_test_sketches = np.concatenate( (test_sketches, benign_validate_sketches), axis=0) kf_test_targets = np.concatenate((test_targets, benign_validate_names), axis=0) # Modeling (training) models = [] for index in benign_train: models.append(all_models[index]) print "We will attempt multiple cluster threshold configurations for the best results." print "Trying: mean/max distances with 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0 standard deviation(s)..." print "Best Configuration: "
proj=True, picks=picks, baseline=None, preload=True) epochs_train = epochs.copy().crop(tmin=1., tmax=2.) labels = epochs.events[:, -1] - 2 ############################################################################### # Classification with linear discrimant analysis # Define a monte-carlo cross-validation generator (reduce variance): scores = [] epochs_data = epochs.get_data() epochs_data_train = epochs_train.get_data() cv = ShuffleSplit(10, test_size=0.2, random_state=42) cv_split = cv.split(epochs_data_train) # Assemble a classifier lda = LinearDiscriminantAnalysis() csp = CSP(n_components=4, reg=None, log=True, norm_trace=False) # Use scikit-learn Pipeline with cross_val_score function clf = Pipeline([('CSP', csp), ('LDA', lda)]) scores = cross_val_score(clf, epochs_data_train, labels, cv=cv, n_jobs=1) # Printing the results class_balance = np.mean(labels == labels[0]) class_balance = max(class_balance, 1. - class_balance) print("Classification accuracy: %f / Chance level: %f" % (np.mean(scores), class_balance))
v1 = time.perf_counter() vecTime = v1 - v0 # prepare output folder print() print(f"Threshold: {threshold}; Eps: {eps}") outputFile = f"{outDir}/FLAST___t{threshold}__eps{eps}.csv" with open(outputFile, "w") as fileOut: fileOut.write( "fold,numFlakyTrainSet,numNonFlakyTrainSet,numFlakyTestSet,numNonFlakyTestSet,vecTime,trainTime,testTime,avgPredTime,f-measure,precision,recall,accuracy,tp,fp,fn,tn\n" ) kf = ShuffleSplit(n_splits=numKFold, test_size=testSize) successFold = 0 for kFold, (trnIdx, tstIdx) in enumerate( kf.split(dataPointsList, dataLabelsList)): # data points vectorization v0 = time.perf_counter() dataPointsFlaky, dataPointsNonFlaky = flast.getDataPointsInfo( projectBasePath, projectName) dataPoints = dataPointsFlaky + dataPointsNonFlaky Z = flast.flastVectorization(dataPoints, reduceDim=reduceDim, dim=dim, eps=eps) dataPointsList = np.array( [Z[i].toarray() for i in range(Z.shape[0])]) dataLabelsList = np.array([1] * len(dataPointsFlaky) + [0] * len(dataPointsNonFlaky)) v1 = time.perf_counter() vecTime = v1 - v0
# Read epochs (training is done only between 1 and 2 s).
# Testing will be done with a running classifier.
epochs = Epochs(raw, events, event_id, tmin, tmax, proj=True, picks=picks,
                baseline=None, preload=True)
epochs_train = epochs.copy().crop(tmin=1., tmax=2.)
labels = epochs.events[:, -1] - 2

###############################################################################
# Classification with linear discriminant analysis

# Monte-Carlo cross-validation generator (reduces variance):
scores = []
epochs_data = epochs.get_data()
epochs_data_train = epochs_train.get_data()
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
cv_split = cv.split(epochs_data_train)

# Classifier: CSP spatial filtering followed by LDA
lda = LinearDiscriminantAnalysis()
csp = CSP(n_components=4, reg=None, log=True, norm_trace=False)

# Both steps are cross-validated together through a scikit-learn Pipeline
clf = Pipeline([('CSP', csp), ('LDA', lda)])
scores = cross_val_score(clf, epochs_data_train, labels, cv=cv, n_jobs=1)

# Report mean accuracy next to the share of the majority class
class_balance = np.mean(labels == labels[0])
class_balance = max(class_balance, 1. - class_balance)
print("Classification accuracy: %f / Chance level: %f" % (np.mean(scores),
                                                          class_balance))
def test_shufflesplit_reproducible():
    """Two passes over a seeded ShuffleSplit must give the same train sets."""
    ss = ShuffleSplit(random_state=21)
    first_pass = [train for train, _ in ss.split(X)]
    second_pass = [train for train, _ in ss.split(X)]
    assert_array_equal(first_pass, second_pass)
def run_model(X, y, outdir, **params):
    """Cross-validate the classifier described by ``params`` on 5 splits.

    For every split a classifier is built with ``build_classifier(**params)``,
    fitted, and evaluated with ``analyze``, which also receives the shared
    PRC and ROC axes.  One CSV row per fold plus an ``AVG`` row is appended
    to ``<outdir>output_<model>.csv`` (a header is written when the file
    does not exist yet), and the two figures are saved as PNG in ``outdir``.

    ``outdir`` is concatenated directly with the file names, so it has to
    end with a path separator.  ``params`` must contain the key ``'model'``.

    Returns the classifier fitted on the last split.
    """
    import time

    outfile = outdir + 'output_{}.csv'.format(params['model'])
    if not os.path.isfile(outfile):
        with open(outfile, 'w') as f:
            f.write(''.join('{},'.format(param) for param in params))
            f.write('Accuracy,AUPRC,AUROC,Fold\n')

    # Leading cells shared by every row written for this configuration.
    param_cells = ''.join('{},'.format(params[param]) for param in params)

    accuracies = []
    auprcs = []
    aurocs = []

    prc_fig = plt.figure()
    prc_ax = prc_fig.add_subplot(1, 1, 1)
    roc_fig = plt.figure()
    roc_ax = roc_fig.add_subplot(1, 1, 1)

    # prepare validation splits
    n_splits = 5
    test_size = 0.2
    # NOTE(review): `seed` is not defined in this function; presumably a
    # module-level name - confirm.
    splitter = ShuffleSplit(n_splits=n_splits, test_size=test_size,
                            random_state=seed)

    start = time.time()
    for fold, (train, test) in enumerate(splitter.split(X), start=1):
        # Seconds elapsed since the first fold started.  The call form of
        # print is valid on Python 2 and 3; the statement form is a
        # SyntaxError on Python 3.
        print(time.time() - start)

        X_trn = X[train]
        y_trn = y[train]
        X_val = X[test]
        y_val = y[test]

        classifier = build_classifier(**params)
        classifier.fit(X_trn, y_trn)

        accuracy, auprc, auroc = analyze(classifier, X_val, y_val,
                                         prc_ax, roc_ax, **params)
        accuracies.append(accuracy)
        auprcs.append(auprc)
        aurocs.append(auroc)

        with open(outfile, 'a') as f:
            f.write(param_cells)
            f.write('{},{},{},{}\n'.format(accuracy, auprc, auroc, fold))

    n_folds = len(accuracies)
    avg_accuracy = sum(accuracies) / n_folds
    avg_auprc = sum(auprcs) / n_folds
    avg_auroc = sum(aurocs) / n_folds

    # Write average values to output file
    with open(outfile, 'a') as f:
        f.write(param_cells)
        f.write('{},{},{},AVG\n'.format(
            avg_accuracy,
            avg_auprc,
            avg_auroc,
        ))

    # File-name suffix built from the parameter values
    name = ''
    for param in params:
        if isinstance(params[param], float):
            name = name + '_' + '{:.0g}'.format(params[param])
        else:
            name = name + '_' + str(params[param])

    # PRC figure
    prc_ax.set_xlabel('Recall')
    prc_ax.set_ylabel('Precision')
    prc_ax.set_xlim([0.0, 1.0])
    prc_ax.set_ylim([0.0, 1.05])
    prc_ax.set_title('PRC{}'.format(name))
    prc_fig.savefig(outdir + 'PRC{}'.format(name) + ".png")
    plt.close(prc_fig)

    # ROC figure
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_title('ROC{}'.format(name))
    roc_fig.savefig(outdir + 'ROC{}'.format(name) + ".png")
    plt.close(roc_fig)

    return classifier
def train_model(model, num_split, seed, X, Y, neural_network=0):
    """
    Train input model and obtain averaged results.

    The same model object is refit on every split (it is not cloned), so the
    returned model carries the state left by the last split.  The reported
    time per split covers fitting *and* predicting on the train and test
    parts.

    :param model: Input model for training.
    :param num_split: (int) Number of splits for averaging of performance metrics.
    :param seed: (int) Seed for random state.
    :param X: (numpy array) Training input.
    :param Y: (numpy array) Class label.
    :param neural_network: (bool) whether the model is a neural network model (keras) or conventional machine
    learning model (scikit-learn).
    :return: Trained model
    """
    from sklearn.model_selection import ShuffleSplit
    import numpy as np
    import time

    if neural_network:
        # Imported lazily so that scikit-learn-only runs do not need keras.
        from keras.utils import to_categorical

    shuffle = ShuffleSplit(n_splits=num_split, random_state=seed,
                           test_size=0.2)
    accuracy, precision, recall, f1 = 0.0, 0.0, 0.0, 0.0
    fitting_time = 0.0

    # Bound before the loop so the function still returns the model when the
    # splitter yields nothing.
    ml = model

    for i, (train_idx, test_idx) in enumerate(shuffle.split(X), 1):
        print("== Split %s ==" % i)
        start = time.perf_counter()

        x_train, x_test = X[train_idx], X[test_idx]
        y_train, y_test = Y[train_idx], Y[test_idx]

        if neural_network:
            # keras is fed one-hot targets.
            y_train_categorical = to_categorical(y_train)
            ml.fit(x_train, y_train_categorical, epochs=1, batch_size=32,
                   verbose=0)
        else:
            ml.fit(x_train, y_train)

        pred_train = ml.predict(x_train)
        pred_test = ml.predict(x_test)
        if neural_network:
            # Per-class scores -> predicted class index.
            pred_train = pred_train.argmax(axis=1)
            pred_test = pred_test.argmax(axis=1)
        pred_train = np.around(np.ndarray.flatten(pred_train))
        pred_test = np.around(np.ndarray.flatten(pred_test))
        end = time.perf_counter()

        acc_, prec_, rec_, f1_ = performance_metrics(y_train, y_test,
                                                     pred_train, pred_test)
        # Accumulate running means over the splits.
        accuracy += float(acc_) / num_split
        precision += float(prec_) / num_split
        recall += float(rec_) / num_split
        f1 += float(f1_) / num_split
        fitting_time += (end - start) / num_split
        print("Fitting time", (end - start), "\n")

    print("===== Average results over %s splits =====" % num_split)
    print("Accuracy : %f" % accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 score:", f1)
    print("Average time taken: %f" % fitting_time)
    print("==========================================")
    return ml
def get_cv(X, y):
    """Return a generator of (train, test) index arrays for ``X``.

    Three shuffle splits with 20% of the samples held out, seeded with 57 so
    the splits are reproducible.  ``y`` is accepted but not used.
    """
    return ShuffleSplit(random_state=57, test_size=0.2, n_splits=3).split(X)
def train_test_split(*arrays, **options):
    """Extend sklearn.model_selection.train_test_split to have group split.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.

    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : None or str (default='simple')
        How to shuffle the data before splitting.
        None, no shuffle.
        For str, one of 'simple', 'stratified' and 'group', corresponding to
        `ShuffleSplit`, `StratifiedShuffleSplit` and `GroupShuffleSplit`,
        respectively.

    labels : array-like or None (default=None)
        Ignored if shuffle is None or 'simple'.
        When shuffle='stratified', this array is used as class labels.
        When shuffle='group', this array is used as groups.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    Raises
    ------
    ValueError
        If no array is given, if ``labels`` is None while ``shuffle`` is
        'group' or 'stratified', or if ``shuffle`` is not a supported value.
    TypeError
        If unknown keyword options are passed.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")

    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    shuffle = options.pop('shuffle', 'simple')
    labels = options.pop('labels', None)

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    # Both label-driven modes need labels; fail early with a clear message
    # instead of an unrelated error from inside the splitter.
    if shuffle in ('group', 'stratified') and labels is None:
        raise ValueError("When shuffle='%s', "
                         "labels should not be None!" % shuffle)

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    if shuffle == 'group':
        labels = check_array(labels, ensure_2d=False, dtype=None)
        # For group splitting the train/test sizes count groups, not rows.
        uniques = np.unique(labels)
        n_samples = uniques.size

    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)

    shuffle_options = dict(test_size=n_test,
                           train_size=n_train,
                           random_state=random_state)

    if shuffle is None:
        if labels is not None:
            warnings.warn("The `labels` is ignored for "
                          "shuffle being None!")

        # No shuffling: the leading rows train, the following rows test.
        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)

    elif shuffle == 'simple':
        if labels is not None:
            warnings.warn("The `labels` is not needed and therefore "
                          "ignored for ShuffleSplit, as shuffle='simple'!")

        cv = ShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None))

    elif shuffle == 'stratified':
        cv = StratifiedShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=labels))

    elif shuffle == 'group':
        cv = GroupShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None, groups=labels))

    else:
        raise ValueError("The argument `shuffle` only supports None, "
                         "'simple', 'stratified' and 'group', but got `%s`!"
                         % shuffle)

    # Interleave as [a_train, a_test, b_train, b_test, ...].
    return list(chain.from_iterable((safe_indexing(a, train),
                                     safe_indexing(a, test)) for a in arrays))
column_q = ['ct_dst_sport_ltm','tcprtt','dwin','ct_src_dport_ltm', 'ct_dst_src_ltm','ct_dst_ltm','smean','dmean','dtcpb',] # data_x_xgboost = pd.DataFrame(data2,columns=column_q) # data_x = pd.get_dummies(data_x_xgboost) data_x = pd.get_dummies(data2) data_two = pd.concat([data_x, data_ytwo], axis=1) data_five = pd.concat([data_x, data_yfive], axis=1) # scaler_2 = MinMaxScaler(feature_range=(0, 1)) #自动将dtype转换成float64 # data_two = scaler_2.fit_transform(data_two) # index_train = np.arange(175341) # np.random.shuffle(index_train) data_train = np.array(data_five.iloc[:175341, :]) from sklearn.model_selection import ShuffleSplit rs = ShuffleSplit(n_splits=1, test_size=0.3, random_state=1) for train_1,train_2 in rs.split(data_train): train_70 = data_train[train_1,:] train_30 = data_train[train_2,:] train_70_x = train_70[:,:-1] train_70_y = train_70[:,-1] train_30_x = train_30[:,:-1] train_30_y = train_30[:,-1] x_test = np.array(data_five.iloc[175341:, :-1]) y_test = np.array(data_five.iloc[175341:, -1]) # x_train = x_train[index_train] # y_train = y_train[index_train] scaler_2 = MinMaxScaler(feature_range=(0, 1)) #自动将dtype转换成float64 train_70_x = scaler_2.fit_transform(train_70_x) train_30_x = scaler_2.transform(train_30_x) x_test = scaler_2.transform(x_test)
# Base recommender.  The ``alpha`` given here (10e-8, i.e. 1e-7) is replaced
# by each candidate value of the grid search below.
dl_rec = DLRecommender(fm_decoder,
                       n_components=50,
                       batch_size=10,
                       n_epochs=1,
                       alpha=10e-8,
                       learning_rate=.75,
                       memory=mem,
                       l1_ratio=0.,
                       random_state=0)

# Grid search over five log-spaced alpha values with unshuffled 3-fold CV.
# ``KFold`` takes ``n_splits``; the former ``n_folds`` keyword is not accepted
# by released versions of sklearn.model_selection.KFold.
# refit=False: no final estimator is fitted on the full training data.
dl_cv = GridSearchCV(dl_rec,
                     param_grid={'alpha': np.logspace(-4, 0, 5)},
                     cv=KFold(shuffle=False, n_splits=3),
                     error_score=-1000,
                     n_jobs=15,
                     refit=False,
                     verbose=10)

estimators = [dl_cv]

# One ``single_run`` call per (outer split, estimator) pair.  n_jobs=1 here,
# so the outer level runs sequentially and only the grid search parallelises.
scores = Parallel(n_jobs=1, verbose=10)(
    delayed(single_run)(X, y, estimator, train, test,
                        estimator_idx, split_idx,
                        output_dir=output_dir,
                        )
    for split_idx, (train, test) in enumerate(uniform_split.split(X, y))
    for estimator_idx, estimator in enumerate(estimators))
species_key_df = df_all[['Species', 'Species_code']].drop_duplicates() # create arrays of required data X_columns = ['leaf length', 'leaf width', 'widest point', 'total veins'] y_columns = ['Species'] X = df_equal[X_columns].values y = df_equal[y_columns].values # parameters of the model n_neighbors = 10 weights = ['uniform', 'distance'] weight = weights[0] ss = ShuffleSplit(n_splits=10, test_size=0.1) for train_index, test_index in ss.split(X): # generate data from indices X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index] # fit the training data clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weight) clf.fit(X_train, y_train.ravel()) # predict the test data output = clf.predict(X_test) # report results score = clf.score(X_test, y_test) print("Score: {:.2%}".format(score))
def testStacking(self):
    """Compare a stacked Hyperpipe with a hand-written scikit-learn stacking.

    Two source pipes (one per feature range, built from ``self.surface`` and
    ``self.thickness`` column counts) feed an SVC meta classifier inside a
    'mother' Hyperpipe.  The same nested optimisation is then spelled out
    with plain scikit-learn on ``self.X`` / ``self.y``, and the final test
    accuracies of both implementations must be equal.

    NOTE(review): the source of this method was flattened onto a few lines;
    the nesting below is reconstructed from the data flow and should be
    checked against the original file.
    """
    svc_c = [.1, 1]
    svc_kernel = ['linear']

    cv_outer = ShuffleSplit(n_splits=1, test_size=0.2, random_state=3)
    # No random_state here: KFold does not shuffle by default, so a seed has
    # no effect, and current scikit-learn raises ValueError when one is given
    # without shuffle=True.
    cv_inner = KFold(n_splits=3)

    # Column index ranges of the two data sources inside self.X.
    sources = [np.arange(0, self.surface.shape[1]),
               np.arange(self.surface.shape[1],
                         self.surface.shape[1] + self.thickness.shape[1])]

    ##################################################################################
    # SET UP HYPERPIPES
    ##################################################################################

    # surface pipe
    surface_pipe = Hyperpipe('surface_pipe', optimizer='grid_search',
                             metrics=['accuracy'],
                             inner_cv=cv_inner, verbose=1)
    surface_pipe += PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel})
    # use source filter to select data for stacked hyperpipes
    surface_pipe.filter_element = SourceFilter(sources[0])

    # thickness pipe
    thickness_pipe = Hyperpipe('thickness_pipe', optimizer='grid_search',
                               metrics=['accuracy'],
                               inner_cv=cv_inner, verbose=1)
    thickness_pipe += PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel})
    # use source filter to select data for stacked hyperpipes
    thickness_pipe.filter_element = SourceFilter(sources[1])

    # Mother Pipe
    mother = Hyperpipe('mother', optimizer='grid_search',
                       metrics=['accuracy'],
                       inner_cv=cv_inner,
                       outer_cv=cv_outer,
                       eval_final_performance=True, verbose=1)
    mother += PipelineStacking('multiple_sources', [surface_pipe, thickness_pipe], voting=False)
    mother += PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel})

    mother.fit(self.X, self.y)
    final_score_photon = mother.result_tree.get_best_config_performance_test_set(0).metrics['accuracy']

    ##################################################################################
    # SKLEARN
    ##################################################################################
    for train1, test in cv_outer.split(self.X):
        X_train1 = self.X[train1]
        X_test = self.X[test]
        y_train1 = self.y[train1]
        y_test = self.y[test]
        results_outer = {'C': [], 'kernel': [], 'val1_score': []}
        done_source_optimization = False

        # Grid over the meta classifier's hyperparameters.
        for c_outer in svc_c:
            for kernel_outer in svc_kernel:
                results_outer['C'].extend([c_outer])
                results_outer['kernel'].extend([kernel_outer])
                print('C Outer:', c_outer, 'Kernel Outer:', kernel_outer, '\n')
                results_val1 = []
                for train2, val1 in cv_inner.split(X_train1):
                    X_train2 = X_train1[train2]
                    X_val1 = X_train1[val1]
                    y_train2 = y_train1[train2]
                    y_val1 = y_train1[val1]

                    if done_source_optimization is not True:
                        source_predictions_train2 = list()
                        source_predictions_val1 = list()
                        best_inner_config = []
                        for source in range(2):
                            results_source = {'C': list(), 'kernel': list(),
                                              'test_score': list(),
                                              'test_predictions': list()}
                            # Grid over the source classifier's hyperparameters.
                            for c_inner in svc_c:
                                for kernel_inner in svc_kernel:
                                    results_source['C'].append(c_inner)
                                    results_source['kernel'].append(kernel_inner)
                                    print('Source {} C:{} Kernel:{}\n'.format(source, c_inner, kernel_inner))
                                    results_source_folds = list()
                                    for train3, val2 in cv_inner.split(X_train2):
                                        X_train3 = X_train2[train3][:, sources[source]]
                                        X_val2 = X_train2[val2][:, sources[source]]
                                        y_train3 = y_train2[train3]
                                        y_val2 = y_train2[val2]

                                        svc_source = SVC(kernel=kernel_inner, C=c_inner)
                                        svc_source.fit(X_train3, y_train3)
                                        results_source_folds.append(svc_source.score(X_val2, y_val2))
                                    results_source['test_score'].append(np.mean(results_source_folds))

                            best_inner_config_id = np.argmax(results_source['test_score'])
                            best_inner_config.append({'C': results_source['C'][best_inner_config_id],
                                                      'kernel': results_source['kernel'][best_inner_config_id]})
                            print('Optimum config for source {}: {}'.format(source, best_inner_config[-1]))
                            print('Now fitting optimum source pipe...')
                            svc_source_opt = SVC(C=best_inner_config[-1]['C'],
                                                 kernel=best_inner_config[-1]['kernel'])
                            svc_source_opt.fit(X_train2[:, sources[source]], y_train2)
                            source_predictions_train2.append(svc_source_opt.predict(X_train2[:, sources[source]]))
                            source_predictions_val1.append(svc_source_opt.predict(X_val1[:, sources[source]]))
                        # NOTE(review): from here on the source predictions
                        # computed for this fold are reused for every later
                        # fold and meta configuration -- confirm this is the
                        # intended mirror of the Hyperpipe behaviour.
                        done_source_optimization = True
                    else:
                        print('Skipping optimization of sources')

                    print('Now fit 2nd level classifier with C={} and kernel={}'.format(c_outer, kernel_outer))
                    svc_meta = SVC(C=c_outer, kernel=kernel_outer)
                    # Meta features: one column of predictions per source.
                    svc_meta.fit(np.transpose(np.asarray(source_predictions_train2)), y_train2)
                    results_val1.append(svc_meta.score(np.transpose(np.asarray(source_predictions_val1)), y_val1))
                results_outer['val1_score'].append(np.mean(results_val1))

        best_outer_config_id = np.argmax(results_outer['val1_score'])
        best_outer_config = {'C': results_outer['C'][best_outer_config_id],
                             'kernel': results_outer['kernel'][best_outer_config_id]}
        print('Optimum config for meta classifier: {}'.format(best_outer_config))
        print('Now fitting optimum meta pipe...')
        print('...with source config for source 1: {} and source 2: {}'.format(best_inner_config[0], best_inner_config[1]))

        # Refit the best source and meta classifiers on the full outer
        # training part.
        svc_meta_opt = SVC(C=best_outer_config['C'], kernel=best_outer_config['kernel'])
        svc_source_1_opt = SVC(C=best_inner_config[0]['C'], kernel=best_inner_config[0]['kernel'])
        svc_source_2_opt = SVC(C=best_inner_config[1]['C'], kernel=best_inner_config[1]['kernel'])
        svc_source_1_opt.fit(X_train1[:, sources[0]], y_train1)
        svc_source_2_opt.fit(X_train1[:, sources[1]], y_train1)
        pred_source1_train1 = svc_source_1_opt.predict(X_train1[:, sources[0]])
        pred_source2_train1 = svc_source_2_opt.predict(X_train1[:, sources[1]])
        svc_meta_opt.fit(np.transpose(np.asarray([pred_source1_train1, pred_source2_train1])), y_train1)

        # get test performance
        pred_source1_test = svc_source_1_opt.predict(X_test[:, sources[0]])
        pred_source2_test = svc_source_2_opt.predict(X_test[:, sources[1]])
        final_score = svc_meta_opt.score(np.transpose(np.asarray([pred_source1_test, pred_source2_test])), y_test)
        print('Final test performance: {}'.format(final_score))

        self.assertEqual(final_score, final_score_photon)
# Per-class and macro-averaged scores for the predictions computed earlier.
print('Precision: \t{}'.format(metrics.precision_score(y_test, predicted, average=None)))
print('Recall: \t{}'.format(metrics.recall_score(y_test, predicted, average=None)))
print('F1: \t\t{}'.format(metrics.f1_score(y_test, predicted, average=None)))
print('Macro Precision: \t{}'.format(metrics.precision_score(y_test, predicted, average='macro')))
print('Macro Recall: \t\t{}'.format(metrics.recall_score(y_test, predicted, average='macro')))
print('Macro F1: \t\t{}'.format(metrics.f1_score(y_test, predicted, average='macro')))

# Repeated random train/test splits (ShuffleSplit, not stratified k-fold).
print('-------------------------------- Shuffle Split ---------------------------------')
total_score = 0
runs = 0
from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10)
# NOTE(review): the source was flattened onto one line; the loop body is
# reconstructed on the assumption that every split is reported.
for train, test in ss.split(tweets, target):
    X_train = np.array(tweets)[train]
    y_train = target[train]
    X_test = np.array(tweets)[test]
    y_test = target[test]
    # A fresh pipeline per split so no state is shared between splits.
    # class_weight='balanced' replaces the removed 'auto' option; both weight
    # classes inversely to their frequency.
    pipeline = Pipeline([('vect', CountVectorizer(max_df=0.75, ngram_range=(1, 2))),
                         ('tfidf', TfidfTransformer(norm='l1', use_idf=False)),
                         ('clf', ExtraTreesClassifier(random_state=0, n_estimators=10,
                                                      class_weight='balanced'))])
    pipeline = pipeline.fit(X_train, y_train)
    predicted = pipeline.predict(X_test)
    print('Accuracy: {}'.format(accuracy_score(y_test, predicted)))
    print(metrics.classification_report(y_test, predicted))
    print('Macro Precision: \t{}'.format(metrics.precision_score(y_test, predicted, average='macro')))
def load_data(training=False):
    """Load the .gdf recordings, cut sliding-window epochs and split them.

    Every ``*.gdf`` file under ``data_i2r/BCI_IV_2a/TrainingSet`` (when
    ``training`` is true) or ``.../TestingSet`` (otherwise) is read,
    band-pass filtered to 4-40 Hz and epoched around the LEFT_HAND (code 2)
    and RIGHT_HAND (code 3) events.  Each epoch is cropped into 2-second
    windows that start at 0.5 s and advance in 0.1 s steps while the window
    end stays below 4.1 s.  All windows of all files are pooled and split
    80/20 with ``train_test_split``.

    NOTE(review): the source of this function was flattened onto a few
    lines; the nesting below is reconstructed from the data flow and should
    be checked against the original file.

    :param training: select the training directory instead of the testing one.
    :return: tuple ``(X_train, y_train, X_test, y_test)`` of numpy arrays.
        Labels are the event code minus 2.
    """
    tmin, tmax = -1., 4.1
    raw_edf = []
    X = []
    y = []
    # Maps the event codes stored in the files to the small integer codes
    # used below.
    stim_code = dict([(32766, 1), (769, 2), (770, 3), (771, 5), (772, 4),
                      (783, 6), (276, 7), (277, 8), (768, 9), (1023, 10),
                      (1072, 11)])

    if training:
        path = op.join('data_i2r', 'BCI_IV_2a', 'TrainingSet')
    else:
        path = op.join('data_i2r', 'BCI_IV_2a', 'TestingSet')

    file_list = glob.glob(path + '/*.gdf')
    # Call form of print: valid on both Python 2 and Python 3.
    print(file_list)
    raw_files = [read_raw_edf(raw_fnames, preload=True, stim_channel='auto')
                 for raw_fnames in file_list]
    raw_edf.extend(raw_files)

    samplin_frequency = 250
    for edf_raw in raw_edf:
        event_id = dict()
        # Result is not used below; the call is kept because it may validate
        # the stim channel -- TODO confirm it can be dropped.
        events = find_events(edf_raw, shortest_event=0, stim_channel='STI 014')
        events_from_edf = []
        samplin_frequency = edf_raw._raw_extras[0]['max_samp']
        original_event = edf_raw.find_edf_events()
        annot_list = zip(original_event[1], original_event[4], original_event[2])
        events_from_edf.extend(annot_list)
        events_from_edf = np.array(events_from_edf)

        # Build an (n_events, 3) integer event array: sample index, 0, code.
        events_arr = np.zeros(events_from_edf.shape, dtype=int)
        for row, i_event in enumerate(events_from_edf):
            index = int((float(i_event[0])) * samplin_frequency)
            events_arr[row, :] = index, 0, stim_code[int(i_event[2])]

        # strip channel names of "." characters
        edf_raw.rename_channels(lambda x: x.strip('.'))

        # create Event dictionary based on File
        events_in_edf = [event[2] for event in events_arr[:]]
        if 2 in events_in_edf:
            event_id['LEFT_HAND'] = 2
        if 3 in events_in_edf:
            event_id['RIGHT_HAND'] = 3

        # Apply band-pass filter
        edf_raw.filter(4., 40., fir_design='firwin', skip_by_annotation='edge')  # 4-40Hz
        picks = pick_types(edf_raw.info, meg=False, eeg=True, stim=False,
                           eog=False, exclude='bads')
        print(events_arr[:10])
        print(edf_raw)

        # Files without any of the wanted events contribute nothing.
        if event_id:
            # Read epochs; training windows are cropped from them below.
            epochs = Epochs(edf_raw, events_arr, event_id, tmin, tmax,
                            proj=True, picks=picks, baseline=None,
                            preload=True)

            # Sliding 2-second windows, advanced by repeated float addition
            # exactly as before so the number of windows is unchanged.
            tmaximum = 2.5
            tminimum = 0.5
            epochs_train = []
            while tmaximum < 4.1:
                epochs_train.append(epochs.copy().crop(tmin=tminimum, tmax=tmaximum))
                tminimum = tminimum + 0.1
                tmaximum = tmaximum + 0.1

            # Event codes 2/3 become labels 0/1.
            labels = [epochs_from_train.events[:, -1] - 2
                      for epochs_from_train in epochs_train]
            labels_array = np.array(labels)
            epochs_data_train = [epochs_from_train.get_data()
                                 for epochs_from_train in epochs_train]
            epochs_array_train = np.array(epochs_data_train)

            if len(epochs_data_train) != len(labels_array):
                print("Something is not right")
            else:
                # Pool the epochs of every window into the flat sample lists.
                for window_data, window_labels in zip(epochs_array_train,
                                                      labels_array):
                    X.extend(window_data)
                    y.extend(window_labels)

    X = np.array(X)
    y = np.array(y)
    # split data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
    # return X,y


# if __name__ == '__main__':
#     data_directory = 'data_i2r';
#     #user = '******'
#     (X_train,y_train,X_test,y_test)=load_data(training=True)
#     print ("train data size is " + str(X_train.size))
#     print ("test data size is "+ str(X_test.size))
def run_eval(dataset, iterations):
    """Repeatedly split a dataset 50/50, train in subprocesses, plot results.

    For every iteration one random split is drawn and a ``Process`` is
    created per enabled method; the processes are started and joined before
    the next iteration.  Afterwards the per-method result objects are
    unpickled from the temporary files ``<dataset><suffix>`` and handed to
    ``plot_my_results``, and the temporary files are deleted.

    NOTE(review): the source of this function was flattened onto a few
    lines; the nesting below is reconstructed and should be checked against
    the original file.

    :param dataset: one of "compass-gender", "compass-race", "adult-gender",
        "adult-race", "bank", "kdd"; any other value exits with status 1.
    :param iterations: number of random splits to evaluate.
    """
    suffixes = ['Zafar et al.', 'Adaboost', 'AdaFair', 'SMOTEBoost']

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()
    else:
        exit(1)

    create_temp_files(dataset, suffixes)

    # init parameters for zafar method (default settings)
    tau = 3.0
    mu = 1.2
    cons_type = 4
    # Materialised as a list: dict views cannot be indexed on Python 3.
    sensitive_attrs = list(x_control.keys())
    loss_function = "logreg"
    EPS = 1e-6
    sensitive_attrs_to_cov_thresh = {
        sensitive_attrs[0]: {
            0: {0: 0, 1: 0},
            1: {0: 0, 1: 0},
            2: {0: 0, 1: 0},
        }
    }
    cons_params = {
        "cons_type": cons_type,
        "tau": tau,
        "mu": mu,
        "sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh,
    }

    threads = []
    # One lock per potential worker; each worker receives mutex[proc].
    mutex = [Lock() for _ in range(0, 8)]

    random.seed(int(time.time()))

    for iteration in range(0, iterations):
        sss = ShuffleSplit(n_splits=1, test_size=0.5)
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for proc in range(0, 4):
                # Only method index 2 ('AdaFair') is launched; every other
                # index sleeps for one second and is skipped, which leaves
                # the proc == 0 branch below unreachable.
                if proc != 2:
                    time.sleep(1)
                    continue

                if proc > 0:
                    threads.append(
                        Process(target=train_classifier,
                                args=(X_train, X_test, y_train, y_test,
                                      sa_index, p_Group,
                                      dataset + suffixes[proc],
                                      mutex[proc], proc, 200, 1)))
                elif proc == 0:
                    temp_x_control_train = defaultdict(list)
                    temp_x_control_test = defaultdict(list)

                    temp_x_control_train[sensitive_attrs[0]] = x_control[
                        sensitive_attrs[0]][train_index]
                    temp_x_control_test[sensitive_attrs[0]] = x_control[
                        sensitive_attrs[0]][test_index]

                    x_zafar_train, y_zafar_train, x_control_train = ut.conversion(
                        X[train_index], y[train_index],
                        dict(temp_x_control_train), 1)
                    x_zafar_test, y_zafar_test, x_control_test = ut.conversion(
                        X[test_index], y[test_index],
                        dict(temp_x_control_test), 1)

                    threads.append(
                        Process(target=train_zafar,
                                args=(x_zafar_train, y_zafar_train,
                                      x_control_train, x_zafar_test,
                                      y_zafar_test, x_control_test,
                                      cons_params, loss_function, EPS,
                                      dataset + suffixes[proc],
                                      mutex[proc], sensitive_attrs)))

        # Run this iteration's workers to completion before the next split.
        for process in threads:
            process.start()
        for process in threads:
            process.join()
        threads = []

    results = []
    for suffix in suffixes:
        # These files are written by this program's own workers; the context
        # manager closes the handle even if unpickling fails.
        with open(dataset + suffix, 'rb') as infile:
            temp_buffer = pickle.load(infile)
        results.append(temp_buffer.performance)

    plot_my_results(results, suffixes, "Images/" + dataset, dataset)
    delete_temp_files(dataset, suffixes)
# test y_pred = grid_search_cv.predict(X_test) print(accuracy_score(y_test, y_pred)) # 0.8695 # 8: grow a forest # sub-data set n_trees = 1000 n_instances = 100 mini_sets = [] # split X_train into 1000 pieces and each piece has 100 samples reshuffled rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42) # refer to its definition and cloud note: sklearn for mini_train_index, mini_test_index in rs.split(X_train): X_mini_train = X_train[mini_train_index] y_mini_train = y_train[mini_train_index] mini_sets.append((X_mini_train, y_mini_train)) # train tree models in forest forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)] # copy: deep copy # fit and predict: train 1000 tree models with X_mini_train and predict accuracy_scores = [] for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets): tree.fit(X_mini_train, y_mini_train) y_pred = tree.predict(X_test) accuracy_scores.append(accuracy_score(y_test, y_pred)) print(np.mean(accuracy_scores)) # mean accuracy of all test data: 0.8054494999999999
from sklearn.model_selection import ShuffleSplit
import numpy as np

# Toy data: four samples with two features each.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])

# Three random splits, 25% of the samples held out in each.
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
rs.get_n_splits(X)
print(rs)
# Output of the print above (this was pasted into the script as a live
# expression that built and discarded a second splitter):
# ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

# Same again with an explicit train_size: train and test together cover only
# 75% of the samples, so part of the data is left out of every split.
rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25, random_state=0)
for train_index, test_index in rs.split(X):
    print("# TRAIN:", train_index, "TEST:", test_index)
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# NOTE(review): the scaler is fitted on all of Xraw before the train/test
# split below, so the test rows influence the scaling.
myMinMaxScaler = MinMaxScaler()
Xsc = myMinMaxScaler.fit_transform(Xraw)

# The PCA object is created but its projection is currently disabled.
mypca = PCA(n_components=10)
#X = mypca.fit_transform(Xsc)
X = Xsc

# One random 80/20 split; no random_state, so it differs between runs.
rs = ShuffleSplit(n_splits=1, test_size=0.2)
for train_index, test_index in rs.split(X):
    Xtrain = X[train_index, :]
    Xtest = X[test_index, :]
    ytrain = y[train_index]
    ytest = y[test_index]

myclf = GaussianNB()
myclf.fit(Xtrain, ytrain)
ypred = myclf.predict(Xtest)
# Call form of print: valid on both Python 2 and Python 3.
print(confusion_matrix(ytest, ypred))
for n in nFeats ]) brca1Model20 = copy.deepcopy(brca1Modelers[20]).fit(x0, y) brca1Preds = brca1Model20.predict(x0) stats.pearsonr(brca1Preds, y)[0] brca1Model1000 = copy.deepcopy(brca1Modelers[1000]).fit(x0, y) brca1Preds = brca1Model1000.predict(x0) stats.pearsonr(brca1Preds, y)[0] cvR2s_unreg = Series(OrderedDict([ (n, np.mean(cross_val_score_pd(copy.deepcopy(brca1Modelers[n]), X = x0, y = y, cv = cvSched.split(x0)))) for n in nFeats ])) ## ----------------------------------------------------------------- ## L2-regularized linear regression ## ----------------------------------------------------------------- brca1Modelers2 = OrderedDict([ (n, pipeline.Pipeline([ ('featsel', feature_selection.SelectKBest( feature_selection.f_regression, k=n)), ('regressor', linear_model.Ridge( alpha=len(y)*(1.5 + 0.034*n))) ])) for n in nFeats
def train_model(clf, param_grid, X, Y):
    '''Trains and evaluates the model clf from input

    The function selects the best model of clf by optimizing for the validation data,
    then evaluates its performance using the out of sample test data.

    input - clf: the model to train
            param_grid: a dict of hyperparameters to use for optimization;
                        each value is either a sequence of candidates or a
                        distribution object exposing ``rvs`` (e.g. expon())
            X: features
            Y: labels

    output - a dict with the keys
            'conf_matrix': confusion matrix from classifying the test data
            'accuracy', 'sensitivity', 'specificity', 'auc': test metrics
            'params': the best hyperparameters found
            'model': the best estimator (trained model)
            'roc': dict with 'fpr', 'tpr' and 'thresholds' of the ROC curve
    '''
    # First, partition into train and test data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    preferred_n_iter = 5
    # If the number of possible parameter combinations is smaller than the
    # preferred number of iterations, use the number of combinations instead.
    # A distribution is continuous or unbounded and has no len(); it is
    # counted as 100 candidates (any large number would do).
    n_candidates = np.prod([
        100 if hasattr(xs, 'rvs') else len(xs)
        for xs in param_grid.values()
    ])
    n_iter = int(min(preferred_n_iter, n_candidates))

    # Search for the best parameters on the training data.  The training data
    # is itself split into train/validation parts five times to select the
    # parameters.  The splits are materialised because split() returns a
    # one-shot generator.
    cvv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    cv = list(cvv.split(X_train))

    # Randomized search is used instead of a full grid search because of
    # limited computing power.
    random_grid_search = RandomizedSearchCV(
        clf,
        param_distributions=param_grid,
        cv=cv,
        scoring='f1',
        n_iter=n_iter,
        random_state=5,
        refit=True,
        verbose=10,
        n_jobs=-1
    )
    random_grid_search.fit(X_train, Y_train)

    # Evaluate the best model on the test data
    best_model = random_grid_search.best_estimator_
    Y_test_predicted = best_model.predict(X_test)
    Y_test_predicted_prob = best_model.predict_proba(X_test)[:, 1]
    confusion = confusion_matrix(Y_test, Y_test_predicted)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]

    # Calculate recall (sensitivity) from confusion matrix
    sensitivity = TP / float(TP + FN)

    # Calculate specificity from confusion matrix
    specificity = TN / float(TN + FP)

    # Calculate accuracy; float() keeps this a true division on Python 2,
    # consistent with the two metrics above.
    accuracy = (TP + TN) / float(confusion.sum())

    # Calculate axes of ROC curve
    fpr, tpr, thresholds = roc_curve(Y_test, Y_test_predicted_prob)

    # Area under the ROC curve
    auc = roc_auc_score(Y_test, Y_test_predicted_prob)

    return {
        'conf_matrix': confusion,
        'accuracy': accuracy,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'auc': auc,
        'params': random_grid_search.best_params_,
        'model': random_grid_search.best_estimator_,
        'roc': {'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds}
    }