def train_svm(C=0.1, grid=False):
    ds = PascalSegmentation()
    svm = LinearSVC(C=C, dual=False, class_weight='auto')

    if grid:
        data_train = load_pascal("kTrain")
        X, y = shuffle(data_train.X, data_train.Y)
        # prepare leave-one-label-out by assigning labels to images
        image_indicators = np.hstack([np.repeat(i, len(x))
                                      for i, x in enumerate(X)])
        # go down to only 5 "folds"
        labels = image_indicators % 5
        X, y = np.vstack(X), np.hstack(y)

        cv = LeavePLabelOut(labels=labels, p=1)
        param_grid = {'C': 10. ** np.arange(-3, 3)}
        scorer = Scorer(recall_score, average="macro")
        grid_search = GridSearchCV(svm, param_grid=param_grid, cv=cv,
                                   verbose=10, scoring=scorer, n_jobs=-1)
        grid_search.fit(X, y)
    else:
        data_train = load_pascal("train")
        X, y = np.vstack(data_train.X), np.hstack(data_train.Y)
        svm.fit(X, y)
        print(svm.score(X, y))
        eval_on_sp(ds, data_train, [svm.predict(x) for x in data_train.X],
                   print_results=True)

        data_val = load_pascal("val")
        eval_on_sp(ds, data_val, [svm.predict(x) for x in data_val.X],
                   print_results=True)
def tuneAndTrainTest(estimatorType, data, labels, numFolds, patientIds,
                     roundNum, estimator, lossFunction=lossFunction):
    folds = LeavePLabelOut(patientIds[0], p=2)
    errors = []
    counter = 0
    for trainIndices, testIndices in folds:
        # Tuning
        trainData = [data[i] for i in trainIndices]
        trainlabels = [labels[i] for i in trainIndices]
        testData = [data[i] for i in testIndices]
        testLabels = [labels[i] for i in testIndices]

        # Training
        if roundNum == 0:
            estimator.fit(trainData, trainlabels)
        else:
            estimator = optimizeHyperParams(trainData, trainlabels, estimatorType)

        # Testing
        errors.append(lossFunction(estimator, testData, testLabels))

    print np.array(errors).mean()
def tuneAndTrain(predictorType, data, labels, patientIds, numFolds,
                 lossFunction=lossFunction):
    '''
    :param data: data matrix, a structured array.
    :param predictorType: one of: {'SVM', 'RF'}
    :param patientIds: May be ints, strings, etc. denoting which patient every line is taken from.
    :return: the mean error of an optimized predictor of type predictorType, and the optimized, trained predictor.
    '''
    folds = LeavePLabelOut(patientIds, p=2)
    # materialize the folds so they can be sliced below
    folds = [tup for tup in folds]
    errors = []
    for trainIndices, testIndices in folds[:numFolds]:
        # Tuning
        # get actual data and labels for current fold
        trainData = data[trainIndices]
        trainLabels = labels[trainIndices]
        testData = data[testIndices]
        testLabels = labels[testIndices]
        testNames = patientIds[testIndices]
        if np.all(trainLabels == trainLabels[0]):
            continue  # can't train on elements that are all from the same group

        # todo: for testing - skip feature selection
        # selectedFeatures = SelectFeatures(trainData, trainLabels)
        # selectedTrainData = [trainData[f] for f in selectedFeatures]
        # selectedTestData = [testData[f] for f in selectedFeatures]
        selectedTrainData = trainData
        selectedTestData = testData

        # to get the data in a list-of-list format that optimizeHyperParams expects.
        selectedTrainData = [list(tup) for tup in selectedTrainData]
        selectedTestData = [list(tup) for tup in selectedTestData]

        predictor = optimizeHyperParams(selectedTrainData, trainLabels, predictorType)
        # Training
        # predictor.fit(selectedFeaturesTrainData, trainLabels)  # optimizeHyperParams also trains

        # Testing
        errors.append(lossFunction(predictor, selectedTestData, testLabels, testNames))

    return np.array(errors).mean()
def twoStepsLoss(estimator, X, y, names):
    print names
    folds = LeavePLabelOut(names, p=1)
    # ugly patch - correct syntax?
    folds = [tup for tup in folds]
    loss = 0.0
    # each fold is a (train, test) pair; only the test indices are used here
    for _, testIndices in folds[:2]:
        print "in fold"
        testData = X[testIndices]
        testLabels = y[testIndices]
        testLabel = (y[testIndices])[0]
        sickCount = 0.0
        for data in testData:
            sickCount = sickCount + (estimator.predict(data)[0])
        sickCount = sickCount / len(X)
        if sickCount > 0.5:  # mostly sick
            if testLabel == 0:
                loss = loss + 1
        else:
            if testLabel == 1:
                loss = loss + 1
    loss = 1 - (loss / 2)
    return loss
fmri, stimuli, onsets, conditions = read_data_gauthier(subject)
session_id_onset = np.load('sessions_id_onset.npy')
session_id_onset = [[19.2 / len(onsets[session])] * len(onsets[session])
                    for session in range(len(onsets))]
betas, reg = de.glm(fmri, tr, onsets, hrf_model=hrf_model,
                    drift_model='blank', model=model)

betas = np.vstack(betas)
conditions = np.hstack(conditions)
session_id_onset = np.hstack(session_id_onset)

lplo = LeavePLabelOut(session_id_onset, p=1)
for train_index, test_index in lplo:
    # Split into train and test sets
    betas_isi = betas[test_index]
    conditions_isi = conditions[test_index]
    n_points = len(conditions_isi)
    if n_points == 12 * 4:
        isi = 1.6
    elif n_points == 6 * 4:
        isi = 3.2
    elif n_points == 4 * 4:
        isi = 4.8
# Read data
fmri, stimuli, onsets, conditions = read_data_mrt(subject)
session_id_onset = [[session] * len(onsets[session])
                    for session in range(len(onsets))]
betas, reg = de.glm(fmri, tr, onsets, hrf_model=hrf_model, model=model)

betas = np.vstack(betas)
session_id_onset = np.hstack(session_id_onset)
conditions = np.hstack(conditions)

junk_mask = np.where(conditions != 'ju')
conditions = conditions[junk_mask]
betas = betas[junk_mask]
session_id_onset = session_id_onset[junk_mask]

lplo = LeavePLabelOut(session_id_onset, p=1)
for train_index, test_index in lplo:
    # Split into train and test sets
    betas_train, betas_test = betas[train_index], betas[test_index]
    conditions_train, conditions_test = (conditions[train_index],
                                         conditions[test_index])

    # Feature selection
    betas_train, betas_test = de.feature_selection(betas_train, betas_test,
                                                   conditions_train, k=k)

    # Fit a logistic regression to score the model
    accuracy = de.glm_scoring(betas_train, betas_test, conditions_train,
                              conditions_test)
print(train, test)

'''
Leave-P-Label-Out

LeavePLabelOut is similar to Leave-One-Label-Out, but removes the samples
related to P labels for each training/test set.
'''
from sklearn.cross_validation import LeavePLabelOut

X = [[0., 0.], [1., 1.], [-1., -1.], [2., 2.], [3., 3.], [4., 4.]]
Y = [0, 1, 0, 1, 0, 1]
labels = [1, 1, 2, 2, 3, 3]
lplo = LeavePLabelOut(labels, 2)
print(lplo)

for train, test in lplo:
    print(train, test)

'''
Random permutations cross-validation a.k.a. Shuffle & Split

ShuffleSplit

The ShuffleSplit iterator will generate a user-defined number of independent
train/test dataset splits. Samples are first shuffled and then split into a
pair of train and test sets.

It is possible to control the randomness for reproducibility of the results
by explicitly seeding the random_state pseudo random number generator.

Here is a usage example:
'''
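# A minimal ShuffleSplit sketch, assuming the same deprecated
# sklearn.cross_validation API used throughout these snippets
# (signature: ShuffleSplit(n, n_iter, test_size, random_state)).
from sklearn.cross_validation import ShuffleSplit

# 5 samples, 3 independent shuffled splits, 25% of the samples held out
# each time; random_state is fixed so the splits are reproducible.
ss = ShuffleSplit(5, n_iter=3, test_size=0.25, random_state=0)
print(ss)

for train, test in ss:
    print(train, test)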
# scores = -np.log10(selector.pvalues_)
# scores /= scores.max()
# print "selected factors:\n"
# support = selector.get_support(indices=True)
# selected = header[selector.get_support(indices=True)]
# for i, item in enumerate(selected):
#     print item, " ", scores[support[i]]
# print X.shape
# X = selector.transform(X)
# print X.shape

# ridge
lol = LeaveOneLabelOut(offers)
lop = LeavePLabelOut(offers, len(offers_set) - 1)
cvset = lop

rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True,
                            max_depth=5, min_samples_leaf=5)  # 0.675
xrf = ExtraTreesClassifier(n_estimators=100, n_jobs=-1)  # 0.662
dt = DecisionTreeClassifier()  # 0.559
lr = LogisticRegression(C=0.001)  # 0.654
gbr = GradientBoostingRegressor()  # 0.691
# gbr1 = GradientBoostingRegressor(loss="quantile", alpha=0.6)  # 0.542
gbr1 = GradientBoostingRegressor(loss='quantile', alpha=0.5)  # 0.544
gbc = GradientBoostingClassifier()  # 0.692
design = [
    de.design_matrix(len(fmri[session]), tr, onsets[session],
                     conditions[session], hrf_model=hrf_model,
                     durations=durations[session])
    for session in range(len(fmri))
]

# Stack the BOLD signals and the design matrices
fmri = np.vstack(fmri)
design = np.vstack(design)
stimuli = np.vstack(stimuli)
session_id_fmri = np.hstack(session_id_fmri)

lplo = LeavePLabelOut(session_id_fmri, p=2)
for train_index, test_index in lplo:
    # Split into train and test sets
    fmri_train, fmri_test = fmri[train_index], fmri[test_index]
    design_train, design_test = design[train_index], design[test_index]
    stimuli_train, stimuli_test = stimuli[train_index], stimuli[test_index]

    # Feature selection
    fmri_train, fmri_test = de.feature_selection(
        fmri_train, fmri_test, np.argmax(stimuli_train, axis=1))

    # Fit a ridge regression to predict the design matrix
    prediction_test, prediction_train, score = de.fit_ridge(
        fmri_train, fmri_test, design_train,
data_train, data_test, target_train, target_test = \
    train_test_split(housing.data, housing.target, test_size=0.1, random_state=42)
dtr = tree.DecisionTreeRegressor()
dtr.fit(data_train, target_train)
# score = dtr.score(data_test, target_test)

kf = KFold(data_train.shape[0], n_folds=5)
skf = StratifiedKFold(target_train, n_folds=5)
loo = LeaveOneOut(data_train.shape[0])
lpo = LeavePOut(data_train.shape[0], 6)

labels_lolo = [1, 1, 2, 2]
lolo = LeaveOneLabelOut(labels_lolo)
# This strategy splits the samples according to externally supplied integer class labels.
# Each split removes the samples belonging to one label as the test set; the rest form the training set.
labels_lopo = [1, 1, 2, 2, 3, 3]
lopo = LeavePLabelOut(labels_lopo, 2)
# This strategy takes the samples of p distinct labels as the test set each time; the rest is the training set.

# Note the cv parameter of cross_val_score
cross_score = cross_val_score(dtr, data_train, target_train, cv=skf)
print("Cross-validation scores:")
print(cross_score)
# print(score)
'''
# This cannot be scored with metrics.accuracy_score because "continuous is not supported"
pred_test = dtr.predict(data_test)
acc_score = metrics.accuracy_score(target_test, pred_test)
print(acc_score)
'''
n_classes = 12
network = SegNet(in_channels=3, n_classes=n_classes)
network.init_encoder()
network.cuda()

net = skorch.NeuralNet(
    module=network,
    criterion=torch.nn.CrossEntropyLoss,
    train_split=None,
    use_cuda=True,
    batch_size=10,
)

params = {'lr': [0.01, 0.02], 'max_epochs': [5, 10]}

# if only training
# net.fit(X=X, y=y)

image_indicators = np.hstack([np.repeat(i, len(x)) for i, x in enumerate(X)])
labels = image_indicators % n_classes
# X, y = np.vstack(X), np.hstack(Y)
cv = LeavePLabelOut(labels=labels, p=1)

gs = GridSearchCV(net, params, scoring='f1', verbose=10, cv=cv, n_jobs=-1)
gs.fit(X=X, y=Y)
print(gs.best_score_, gs.best_params_)
design = [
    de.design_matrix(len(fmri[session]), tr, onsets[session],
                     conditions[session], hrf_model=hrf_model,
                     drift_model='blank')
    for session in range(len(fmri))
]

# Stack the BOLD signals and the design matrices
fmri = np.vstack(fmri)
design = np.vstack(design)
stimuli = np.vstack(stimuli)
session_id_fmri = np.hstack(session_id_fmri)

lplo = LeavePLabelOut(session_id_fmri, p=1)
for train_index, test_index in lplo:
    # Split into train and test sets
    _, fmri_isi = fmri[train_index], fmri[test_index]
    _, design_isi = design[train_index], design[test_index]
    _, stimuli_isi = stimuli[train_index], stimuli[test_index]
    n_points = np.sum(stimuli_isi[:, 1:])
    if n_points == 12 * 4:
        isi = 1.6
        logistic_window = 4
        delay = 1
    elif n_points == 6 * 4:
        isi = 3.2
        logistic_window = 4
def fit(self, X_task, y, sub_ids):
    DEBUG_FLAG = True

    # self.max_epochs = 333
    self.batch_size = 100
    n_input = X_task.shape[1]  # sklearn-like structure
    n_output = n_input
    rng = np.random.RandomState(42)
    self.input_taskdata = T.matrix(dtype='float32', name='input_taskdata')
    self.params_from_last_iters = []

    index = T.iscalar(name='index')

    # prepare data for theano computation
    if not DEBUG_FLAG:
        X_train_s = theano.shared(value=np.float32(X_task), name='X_train_s')
        y_train_s = theano.shared(value=np.int32(y), name='y_train_s')
    else:
        # from sklearn.cross_validation import StratifiedShuffleSplit
        # folder = StratifiedShuffleSplit(y, n_iter=1, test_size=0.20)
        # new_trains, inds_val = iter(folder).next()

        from sklearn.cross_validation import LeavePLabelOut
        folder = LeavePLabelOut(sub_ids, p=1)
        new_trains, inds_val = iter(folder).next()

        # valid_subs = np.array([1107, 1109, 1110, 1114,  # healthy
        #                        2105, 2106, 2113, 2125,  # depression
        #                        3280, 3279, 3276, 3275])  # sz
        # inds_val = np.in1d(sub_ids, valid_subs)
        # new_trains = np.logical_not(inds_val)

        print('Data points in train set: %i' % np.sum(new_trains))
        print('Data points in validation set: %i' % np.sum(inds_val))
        print('Data features: %i' % n_input)

        X_train, X_val = X_task[new_trains], X_task[inds_val]
        y_train, y_val = y[new_trains], y[inds_val]

        X_train_s = theano.shared(value=np.float32(X_train),
                                  name='X_train_s', borrow=False)
        y_train_s = theano.shared(value=np.int32(y_train),
                                  name='y_train_s', borrow=False)
        # X_val_s = theano.shared(value=np.float32(X_val),
        #                         name='X_train_s', borrow=False)
        # y_val_s = theano.shared(value=np.int32(y_val),
        #                         name='y_cal_s', borrow=False)

        self.dbg_epochs_ = list()
        self.dbg_acc_train_ = list()
        self.dbg_acc_val_ = list()
        self.dbg_ae_cost_ = list()
        self.dbg_lr_cost_ = list()
        self.dbg_ae_nonimprovesteps = list()
        self.dbg_acc_other_ds_ = list()
        self.dbg_combined_cost_ = list()
        self.dbg_prfs_ = list()
        self.dbg_prfs_other_ds_ = list()

    train_samples = len(X_train)

    # V -> supervised / logistic regression
    # W -> unsupervised / auto-encoder

    # computational graph: auto-encoder
    W0_vals = rng.randn(n_input, self.n_hidden).astype(np.float32) * self.gain1
    self.W0s = theano.shared(W0_vals)

    # self.W1s = self.W0s.T  # tied
    W1_vals = rng.randn(self.n_hidden, n_input).astype(np.float32) * self.gain1
    self.W1s = theano.shared(W1_vals)

    bW0_vals = np.zeros(self.n_hidden).astype(np.float32)
    self.bW0s = theano.shared(value=bW0_vals, name='bW0')
    bW1_vals = np.zeros(n_output).astype(np.float32)
    self.bW1s = theano.shared(value=bW1_vals, name='bW1')

    encoding = (self.input_taskdata.dot(self.W0s) + self.bW0s).dot(self.W1s) + self.bW1s

    self.ae_loss = T.sum((self.input_taskdata - encoding) ** 2, axis=1)
    self.ae_cost = (T.mean(self.ae_loss) / n_input)

    # params1 = [self.W0s, self.bW0s, self.bW1s]
    # gparams1 = [T.grad(cost=self.ae_cost, wrt=param1) for param1 in params1]
    #
    # lr = self.learning_rate
    # updates = self.RMSprop(cost=self.ae_cost, params=params1,
    #                        lr=self.learning_rate)
    # f_train_ae = theano.function(
    #     [index],
    #     [self.ae_cost],
    #     givens=givens_ae,
    #     updates=updates)

    # computation graph: logistic regression
    clf_n_output = len(np.unique(y))
    print('SSFLogreg: Fitting %i classes' % clf_n_output)
    my_y = T.ivector(name='y')

    bV0_vals = np.zeros(self.n_hidden).astype(np.float32)
    self.bV0 = theano.shared(value=bV0_vals, name='bV0')
    bV1_vals = np.zeros(clf_n_output).astype(np.float32)
    self.bV1 = theano.shared(value=bV1_vals, name='bV1')

    # V0_vals = rng.randn(n_input, self.n_hidden).astype(np.float32) * self.gain1
    # self.V0s = theano.shared(V0_vals)
    V1_vals = rng.randn(self.n_hidden, clf_n_output).astype(np.float32) * self.gain1
    self.V1s = theano.shared(V1_vals)

    self.p_y_given_x = T.nnet.softmax(
        # T.dot(T.dot(self.input_taskdata, self.V0s) + self.bV0, self.V1s) + self.bV1
        T.dot(T.dot(self.input_taskdata, self.W0s) + self.bV0, self.V1s) + self.bV1)
    self.lr_cost = -T.mean(T.log(self.p_y_given_x)[T.arange(my_y.shape[0]), my_y])
    self.lr_cost = (
        self.lr_cost +
        T.mean(abs(self.W0s)) * self.penalty_l1 +
        # T.mean(abs(self.V0s)) * self.penalty_l1 +
        T.mean(abs(self.bV0)) * self.penalty_l1 +
        T.mean(abs(self.V1s)) * self.penalty_l1 +
        T.mean(abs(self.bV1)) * self.penalty_l1 +

        T.mean((self.W0s ** np.float32(2))) * self.penalty_l2 +
        # T.mean((self.V0s ** 2)) * self.penalty_l2 +
        T.mean((self.bV0 ** np.float32(2))) * self.penalty_l2 +
        T.mean((self.V1s ** np.float32(2))) * self.penalty_l2 +
        T.mean((self.bV1 ** np.float32(2))) * self.penalty_l2)
    self.y_pred = T.argmax(self.p_y_given_x, axis=1)

    # params2 = [self.V0s, self.bV0, self.V1s, self.bV1]
    # params2 = [self.W0s, self.bV0, self.V1s, self.bV1]
    # updates2 = self.RMSprop(cost=self.lr_cost, params=params2,
    #                         lr=self.learning_rate)
    # f_train_lr = theano.function(
    #     [index],
    #     [self.lr_cost],
    #     givens=givens_lr,
    #     updates=updates2)

    self.covar_cost = T.dot(self.W0s.T, self.W0s) - T.eye(self.W0s.shape[1])
    self.covar_cost = T.sum(T.sum(self.covar_cost ** 2, axis=1), axis=0)  # Frobenius

    # combined loss for AE and LR
    combined_params = [
        self.W0s, self.bW0s, self.bW1s, self.W1s,
        # self.V0s, self.V1s, self.bV0, self.bV1]
        self.V1s, self.bV0, self.bV1
    ]
    self.combined_cost = (
        (np.float32(1) - self.lambda_param) * self.ae_cost +
        self.lambda_param * self.lr_cost +
        self.gamma * self.covar_cost)

    combined_updates = self.RMSprop2(cost=self.combined_cost,
                                     params=combined_params,
                                     lr=self.learning_rate)
    givens_combined = {
        self.input_taskdata: X_train_s[index * self.batch_size:(index + 1) * self.batch_size],
        my_y: y_train_s[index * self.batch_size:(index + 1) * self.batch_size]
    }
    f_train_combined = theano.function(
        [index],
        # [self.combined_cost, self.ae_cost, self.lr_cost, self.lr_cost],
        [self.combined_cost, self.ae_cost, self.lr_cost, self.covar_cost],
        givens=givens_combined,
        updates=combined_updates,
        allow_input_downcast=False)

    # optimization loop
    start_time = time.time()
    ae_last_cost = np.inf
    lr_last_cost = np.inf
    no_improve_steps = 0
    acc_train, acc_val = 0., 0.
    for i_epoch in range(self.max_epochs):
        if i_epoch == 1:
            epoch_dur = time.time() - start_time
            total_mins = (epoch_dur * self.max_epochs) / 60
            hs, mins = divmod(total_mins, 60)
            print("Max estimated duration: %i hours and %i minutes" % (hs, mins))

        # AE
        n_batches = train_samples // self.batch_size
        for i in range(n_batches):
            # lr_cur_cost = f_train_lr(i)[0]
            # ae_cur_cost = lr_cur_cost
            combined_cost, ae_cur_cost, lr_cur_cost, covar_cost = f_train_combined(i)

        # evaluate epoch cost
        if ae_last_cost - ae_cur_cost < 0.1:
            no_improve_steps += 1
        else:
            ae_last_cost = ae_cur_cost
            no_improve_steps = 0

        # logistic
        lr_last_cost = lr_cur_cost
        acc_train = self.score(X_train, y_train)
        acc_val, prfs_val = self.score(X_val, y_val, return_prfs=True)

        print('E:%i, ae_cost:%.4f, lr_cost:%.4f, covar_cost:%.4f, '
              'train_score:%.2f, vald_score:%.2f, ae_badsteps:%i' % (
                  i_epoch + 1, ae_cur_cost, lr_cur_cost, covar_cost,
                  acc_train, acc_val, no_improve_steps))

        # if (i_epoch % 10 == 0):
        self.dbg_ae_cost_.append(ae_cur_cost)
        self.dbg_lr_cost_.append(lr_cur_cost)
        self.dbg_combined_cost_.append(combined_cost)
        self.dbg_epochs_.append(i_epoch + 1)
        self.dbg_ae_nonimprovesteps.append(no_improve_steps)
        self.dbg_acc_train_.append(acc_train)
        self.dbg_acc_val_.append(acc_val)
        self.dbg_prfs_.append(prfs_val)

        # save parameters from last 100 iterations
        # if i_epoch > (self.max_epochs - 100):
        #     print('Param pool!')
        param_pool = self.get_param_pool()
        self.params_from_last_iters.append(param_pool)

    total_mins = (time.time() - start_time) / 60
    hs, mins = divmod(total_mins, 60)
    print("Final duration: %i hours and %i minutes" % (hs, mins))

    return self
def main():
    n_samples, step = 40, 20
    load_data = LoadHAR(add_pitch=False, add_roll=False, add_filter=False,
                        n_samples=n_samples, step=step, normalize='channels',
                        comp_magnitude=False, simple_labels=True, common_labels=False)

    conf = ModelConfiguration()
    conf.load_datasets([load_data.uci_hapt], label_limit=6)

    user_idx = -1
    user = None  # 'UCI HAPT10'
    if user is not None:
        train_idx = conf.users != user
        test_idx = conf.users == user
        conf.cv = ((train_idx, test_idx), )
        print('Testing user: %s' % user)
    else:
        # Cross validate on users
        conf.cv = LeavePLabelOut(conf.users, p=1)

        # Divide into K folds balanced on labels
        # conf.cv = StratifiedKFold(conf.users, n_folds=10)

        # And shuffle
        # conf.cv = StratifiedShuffleSplit(np.argmax(conf.y, axis=1), n_iter=1,
        #                                  test_size=0.1, random_state=None)

        # Pure shuffle
        # conf.cv = ShuffleSplit(conf.y.shape[0], n_iter=2, test_size=0.1)

    for train_index, test_index in conf.cv:
        conf.user = user

        model = tconvRNN(n_in=(n_samples, conf.n_features),
                         n_filters=[64, 64, 64, 64],
                         filter_sizes=[5] * 4,
                         pool_sizes=[0] * 4,
                         n_hidden=[128, 128],
                         conv_dropout=0.3,
                         rnn_in_dropout=0.0,
                         rnn_hid_dropout=0.0,
                         output_dropout=0.5,
                         n_out=conf.n_classes,
                         trans_func=leaky_rectify,
                         out_func=softmax,
                         stats=conf.stats)

        if len(conf.cv) > 1:
            user_idx += 1
            if len(conf.cv) == len(conf.user_names):
                conf.user = conf.user_names[user_idx]
            else:
                conf.user = conf.name + ' K_%d' % user_idx

            # Generate root path and edit
            root_path = model.get_root_path()
            model.root_path = "%s_cv_%s_%s" % (root_path, conf.d, conf.user)
            paths.path_exists(model.root_path)
            rmdir(root_path)

        scriptpath = path.realpath(__file__)
        filename = path.basename(scriptpath)
        print(scriptpath, model.root_path, filename)
        shutil.copy(scriptpath, model.root_path + '/' + filename)

        train = TrainModel(model=model,
                           anneal_lr=0.9,
                           anneal_lr_freq=1,
                           output_freq=1,
                           pickle_f_custom_freq=100,
                           f_custom_eval=None)
        train.pickle = False
        train.write_to_logger("Using StratifiedShuffleSplit with n_iter=1, "
                              "test_size=0.1, random_state=None")

        conf.run(train_index, test_index,
                 lr=0.003,
                 n_epochs=500,
                 model=model,
                 train=train,
                 load_data=load_data,
                 batch_size=100)