def evalModel(train_data, eval_data, train_labels, eval_labels, seed):
    joined_data = np.concatenate((train_data, eval_data), axis=0)
    joined_labels = np.concatenate((train_labels, eval_labels), axis=0)

    train_mask = np.zeros(train_data.shape[0]) - 1.0
    eval_mask = np.zeros(eval_data.shape[0])
    joined_mask = np.concatenate((train_mask, eval_mask), axis=0)
    ps = PredefinedSplit(test_fold=joined_mask)

    loss = make_scorer(get_rmsle, greater_is_better=False)

    train_data = sparse.csr_matrix(train_data)
    eval_data = sparse.csr_matrix(eval_data)

    clf = RandomForestRegressor(random_state=seed, verbose=1)
    # clf.fit(train_data, train_labels)
    # preds = clf.predict(eval_data)
    # print(get_rmsle(eval_labels, preds))
    ## achieves 0.263

    # specify parameters and distributions to sample from
    param_dist = {"n_estimators": sp_randint(300, 800),
                  "max_depth": sp_randint(10, 50),
                  "max_features": ['auto', 'sqrt', 'log2'],
                  "min_samples_split": sp_randint(1, 11),
                  "min_samples_leaf": sp_randint(1, 11)}

    # run randomized search
    n_iter_search = 60
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       cv=ps, scoring=loss,
                                       n_iter=n_iter_search, n_jobs=-1,
                                       pre_dispatch='n_jobs', verbose=2)

    start = time()
    random_search.fit(joined_data, joined_labels)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
def svm(speaker, X_train, y_train, X_test, y_test):
    '''
    change C, gamma
    '''
    #### svm model optimizing ###
    # C from 0.01 to 16384 ( 0.01 * 2 ^ 14 )
    num_C = 9
    Cs = 10 ** np.arange(num_C) * 1e-4

    # gamma for rbf
    gammas = [1e-4, 1e-3, 1e-2, 1e-1, 1]

    param_grid = {'estimator__C': Cs,
                  'estimator__gamma': gammas}

    train_val_features = np.concatenate((X_train, X_test), axis=0)
    train_val_labels = np.concatenate((y_train, y_test), axis=0)

    test_fold = np.zeros(train_val_features.shape[0])
    test_fold[:X_train.shape[0]] = -1  # train set indices are -1
    ps = PredefinedSplit(test_fold=test_fold)

    model = OneVsRestClassifier(SVC(kernel='rbf'))
    clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=ps)
    clf = clf.fit(train_val_features, train_val_labels)

    # train_score = clf.score(X_train, y_train)
    # test_score = clf.score(X_test, y_test)
    # clf_y_train = clf.predict(X_train)
    # clf_y_test = clf.predict(X_test)
    # print('speaker {} in svm classification, train accuracy: {}, test accuracy: {}'.format(speaker, train_score, test_score))

    # means = clf.cv_results_['mean_test_score']
    # stds = clf.cv_results_['std_test_score']
    # for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    #     print('%0.3f (+/-%0.3f) for %r' % (mean, std * 2, params))

    # print('best params are {}'.format(clf.best_params_))
    # print(classification_report(y_test, clf_y_test))
    return clf  # , train_score, test_score, clf_y_train, clf_y_test
def KLabelFold(labels, n_folds=3, shuffle=False, random_state=None):
    kfold = KFold(labels.nunique(), n_folds=n_folds, shuffle=shuffle,
                  random_state=random_state)
    unique_labels = labels.unique()
    # Assign each unique label to the fold whose test split contains it,
    # then sum across folds to obtain one test_fold index per sample.
    return PredefinedSplit(
        pd.concat([
            labels.isin(i_x[1]) * i_x[0]
            for i_x in enumerate([unique_labels[mask[1]] for mask in kfold])
        ], axis=1).sum(axis=1))
def set_cv(data, best_features):
    data['CV'] = -1
    data.loc[(data['Date'] >= '01-Aug-2014') & (data['Date'] <= '17-Sep-2014'), 'CV'] = 0
    data.loc[(data['Date'] >= '01-Aug-2013') & (data['Date'] <= '17-Sep-2013'), 'CV'] = 1
    data.loc[(data['Date'] >= '01-Jun-2015') & (data['Date'] <= '17-Jul-2015'), 'CV'] = 2

    X = data[data['Set'] > 0].loc[:, best_features].values
    y = data[data['Set'] > 0].iloc[:, 6].values
    cv_set = data[data['Set'] > 0].iloc[:, 41].values
    ps = PredefinedSplit(test_fold=cv_set)
    return (X, y, ps)
def perform_svm_grid_search(patient_data, classifier, svm_grid, decision_rule_grid,
                            svm_score_func, decision_score_func,
                            estimator_fit_params=None, preictal_time=210):
    # Separate data into folds
    estimator_fit_params = estimator_fit_params if estimator_fit_params is not None else {}
    # decision_score_func_params = decision_score_func_params if decision_score_func_params is not None else {}
    sorted_container = prepare_train_test_viz_data(
        patient_data,
        preictal_time=preictal_time,
        train_only_interictal=False,
    )

    # consolidate folds data into an array; give indices to separate folds for validation
    fold_data = sorted_container['fold_data']
    total_data = []
    data_labels = []
    fold_numbers = []
    running_fold_number = 0
    for data_type in fold_data:
        for fold_number in fold_data[data_type]:
            total_data.extend(fold_data[data_type][fold_number]['data'])
            data_labels.extend(
                [data_type] * np.size(fold_data[data_type][fold_number]['data'], 0))
            fold_numbers.extend(
                [running_fold_number] * np.size(fold_data[data_type][fold_number]['data'], 0))
            running_fold_number += 1
    total_data = np.array(total_data)

    # Use a custom fold iterator (PredefinedSplit), along files.
    cv = PredefinedSplit(fold_numbers)

    # Perform cross validation on the data using GridSearchCV
    cross_validator = GridSearchCV(classifier,
                                   svm_grid,
                                   scoring=svm_score_func,
                                   fit_params=estimator_fit_params,
                                   cv=cv,
                                   verbose=1)
    # TODO: with best parameters, optimize other stuff
    best_estimator = cross_validator.fit(total_data, data_labels)
    return best_estimator
def train_model(set, clf, params):
    """
    Keyword arguments:
    set -- dataset (dictionary)
    clf -- sklearn model
    params -- fine-tuning parameters

    Returns: f1_score train, f1_score valid, f1_score test, best parameters

    The function:
    - uses GridSearchCV to find the best hyperparameters for the model
    - refits the model with those parameters
    - predicts on the train, valid and test sets
    - computes the respective f1_scores
    """
    train = set['train']
    valid = set['valid']
    test = set['test']

    train_input = train[0]
    valid_input = valid[0]
    test_input = test[0]

    train_truth = train[1]
    valid_truth = valid[1]
    test_truth = test[1]

    if params is not None:
        '''
        Use the predetermined validation set in the cross-validation:
        1) combine the training and validation sets into one big training set
        2) set the test_fold vector: 0 for validation entries, -1 for training entries
        3) feed the split into GridSearchCV
        '''
        combine_input = sparse.vstack([train_input, valid_input])
        combine_truth = np.concatenate((train_truth, valid_truth))

        fold = [-1 for i in range(train_input.shape[0])] + \
               [0 for i in range(valid_input.shape[0])]
        ps = PredefinedSplit(test_fold=fold)

        clf = GridSearchCV(clf, params, cv=ps, refit=True)
        clf.fit(combine_input, combine_truth)
    else:
        clf.fit(train_input, train_truth)

    best_param = None if params is None else clf.best_params_
    f1_train = f1_score(train_truth, clf.predict(train_input), average=average)
    f1_valid = f1_score(valid_truth, clf.predict(valid_input), average=average)
    f1_test = f1_score(test_truth, clf.predict(test_input), average=average)

    return f1_train, f1_valid, f1_test, best_param
def check_xgb_model(train, valid, predictors):
    classifier = lambda: XGBClassifier(
        objective='binary:logistic',
        silent=True,
        booster='gbtree',
        learning_rate=0.1,
        n_estimators=300,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        n_jobs=20,
        reg_alpha=0,
        reg_lambda=1,
        seed=100)
    model = Pipeline(steps=[('en', classifier())])

    parameters = {
        # 'en__n_estimators': [100, 300, 500, 700, 1000],
        # 'en__max_depth': range(3, 10, 2),
        # 'en__min_child_weight': np.arange(1, 2.5, 0.1),
        # 'en__gamma': [i / 10.0 for i in range(0, 6)],
        # 'en__subsample': [i / 100.0 for i in range(75, 90, 5)],
        # 'en__colsample_bytree': [i / 100.0 for i in range(75, 90, 5)],
        # 'en__reg_alpha': [1e-5, 1e-2, 0.1, 0, 1, 10, 100],
        # 'en__reg_lambda': [1e-5, 1e-2, 0.1, 0, 1, 10, 100],
    }

    data = pd.concat([train, valid])
    print(data[predictors].head())
    print("train size:%s, val size:%s, data size:%s" %
          (train.shape[0], valid.shape[0], data.shape[0]))

    index = np.zeros(data.shape[0])
    index[:train.shape[0]] = -1
    ps = PredefinedSplit(test_fold=index)

    grid_search = GridSearchCV(model, parameters, cv=ps, n_jobs=-1, verbose=1,
                               scoring='roc_auc')
    grid_search = grid_search.fit(data[predictors], data['label'])
    return grid_search
def create_cv_from_trials(tnums, test_perc=.1, n_iter=5):
    """Create a cross validation object using trial numbers.

    This is similar to LeavePLabelOut, but it defines a stopping point
    and shuffles unique labels before doing the splits. This lets you
    keep datapoints together in train/test splits.

    Parameters
    ----------
    tnums : array, dtype int, shape (n_samples,)
        The labels to use for permutation.
    """
    tnums = tnums.squeeze()
    unique_labels = np.unique(tnums).squeeze()
    n_test = int(np.floor(unique_labels.shape[0] * test_perc))
    test_ixs = np.random.permutation(unique_labels)[:n_test * n_iter]
    test_ixs = test_ixs.reshape([n_iter, n_test])

    test_fold = np.zeros_like(tnums)
    for i, ifold in enumerate(test_ixs):
        for fld in ifold:
            test_fold[tnums == fld] = i
    cv = PredefinedSplit(test_fold)
    return cv
def check_model(train, valid, predictors):
    classifier = lambda: SGDClassifier(
        loss='log',
        penalty='elasticnet',
        fit_intercept=True,
        max_iter=100,
        shuffle=True,
        n_jobs=1,
        class_weight=None)
    model = Pipeline(steps=[('ss', StandardScaler()),
                            ('en', classifier())])

    parameters = {
        'en__alpha': [0.001, 0.01, 0.1],
        'en__l1_ratio': [0.001, 0.01, 0.1]
    }

    # the search set is train + valid
    data = pd.concat([train, valid])
    # print(data[predictors].head())
    print("train size:%s, val size:%s, data size:%s" %
          (train.shape[0], valid.shape[0], data.shape[0]))

    # build the predefined validation split
    index = np.zeros(data.shape[0])
    index[:train.shape[0]] = -1
    ps = PredefinedSplit(test_fold=index)

    grid_search = GridSearchCV(model, parameters, cv=ps, n_jobs=-1, verbose=1,
                               scoring='roc_auc')
    grid_search = grid_search.fit(data[predictors], data['label'])
    return grid_search
def stacking(train_ensum, val_ensum, test_ensum, y_train, y_val, y_test):
    C_params = [i / 10 for i in range(1, 10)] + \
               [i for i in range(1, 10, 1)] + \
               [i for i in range(10, 100, 10)]
    cw_params = [i for i in range(1, 10, 1)] + [i for i in range(10, 100, 10)]
    params = {
        'C': C_params,
        'class_weight': [{1: w} for w in cw_params],
    }

    train_val_features = np.concatenate((train_ensum, val_ensum), axis=0)
    train_val_labels = np.concatenate((y_train, y_val), axis=0)

    test_fold = np.zeros(train_val_features.shape[0])
    test_fold[:train_ensum.shape[0]] = -1
    ps = PredefinedSplit(test_fold=test_fold)

    lr_stack = GridSearchCV(estimator=LogisticRegression(), param_grid=params,
                            scoring=my_scoring, n_jobs=-1, cv=ps, verbose=0)
    lr_stack.fit(train_val_features, train_val_labels)

    lr_stack_train_pred_prob = lr_stack.predict_proba(train_ensum)[:, 1]
    lr_stack_val__pred_prob = lr_stack.predict_proba(val_ensum)[:, 1]
    lr_stack_test_pred_prob = lr_stack.predict_proba(test_ensum)[:, 1]

    a, b = utils.model_key_performance(lr_stack_train_pred_prob, y_train)
    c, d = utils.model_key_performance(lr_stack_val__pred_prob, y_val)
    e, f = utils.model_key_performance(lr_stack_test_pred_prob, y_test)
    return a, b, c, d, e, f
def main():
    # pickle filenames
    saved_classifier_filename = "../classifiers/msu_mfsd.pkl"

    # load or recompute train features. If None, the train features are not loaded into memory
    load_train_features = True
    # retrain or load classifier
    load_classifier = True
    # load or recompute test features
    load_test_features = True

    # descriptor computer
    mlbp_feature_computer = feature_computer.FrameFeatureComputer(
        features.MultiScaleLocalBinaryPatterns((8, 1), (8, 2), (16, 2))
    )
    # mlbp_feature_computer = feature_computer.FrameFeatureComputer(features.LocalBinaryPatterns(8, 1))

    (
        real_features,
        spoof_features_per_dir,
        labels_real,
        labels_spoof_per_dir,
    ) = get_features_and_labels(load_train_features, mlbp_feature_computer)

    # here I should do a cross validation on the features
    """
    param_grid = [
        {'C': [0.0001, 0.001, 0.01], 'kernel': ['linear'], 'class_weight': ['balanced', None]},
        {'C': [0.0001, 0.001, 0.01], 'kernel': ['rbf'], 'gamma': [0.0001, 0.001], 'class_weight': ['balanced', None]}
    ]
    """
    test_fold = dbfeatures.compute_msu_ussa_subjects_folds_arr()
    ps = PredefinedSplit(test_fold=test_fold)

    clf = svm.SVC(
        verbose=True,
        probability=True,
        C=0.0001,
        kernel="linear",
        class_weight="balanced",
    )

    folds_eer = []
    threshes = []
    confusion_matrices = []
    for train_index, test_index in ps:
        # split the features into current train and test folds
        train_features = real_features[train_index]
        test_features = real_features[test_index]
        train_labels = labels_real[train_index]
        test_labels = labels_real[test_index]
        for i in range(len(spoof_features_per_dir)):
            train_features = np.concatenate(
                (train_features, spoof_features_per_dir[i][train_index]), 0
            )
            test_features = np.concatenate(
                (test_features, spoof_features_per_dir[i][test_index]), 0
            )
            train_labels = np.concatenate(
                (train_labels, labels_spoof_per_dir[i][train_index]), 0
            )
            test_labels = np.concatenate(
                (test_labels, labels_spoof_per_dir[i][test_index]), 0
            )

        # train the classifier
        clf.fit(train_features, train_labels)
        # use the classifier to predict the labels for test_features
        pred_labels = clf.predict(test_features)

        # create the roc curve
        fpr, tpr, threshold = roc_curve(test_labels, pred_labels, pos_label=1)
        # compute the equal error rate
        eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0)
        thresh = interp1d(fpr, threshold)(eer)
        folds_eer.append(eer)
        threshes.append(thresh)

        conf_mat = confusion_matrix(test_labels, pred_labels)
        confusion_matrices.append(conf_mat)

    # print the mean and standard deviation of the equal error rate across the folds
    print(np.mean(folds_eer), np.std(folds_eer))
    for conf_mat in confusion_matrices:
        print(conf_mat)
# CLASSIFIER #
#------------#
if not os.path.exists(config['results_folder']):
    os.makedirs(config['results_folder'])
f = open(config['results_folder'] + experiment_name + '.txt', 'w')

if config['audios_list'] == False:
    print('train/val/test partitions are pre-defined!')

    if config['model_type'] == 'SVM':
        # hyperparameter search in val set
        x_dev = np.concatenate((x_train, x_val), axis=0)
        y_dev = np.concatenate((y_train, y_val), axis=0)
        val_mask = np.concatenate(
            (-np.ones(len(y_train)), np.zeros(len(y_val))), axis=0)
        ps = PredefinedSplit(test_fold=val_mask)
        svc = SVC()
        hps = GridSearchCV(svc,
                           svm_params,
                           cv=ps,
                           n_jobs=3,
                           pre_dispatch=3 * 8,
                           verbose=config['SVM_verbose']).fit(x_dev, y_dev)
        print('Best hyperparameter: ' + str(hps.best_params_))
        # define final model
        model = SVC()
        model.set_params(**hps.best_params_)

    else:
        score_max = 0
        h_max = -1
def _cost_fn(argd, X, y, EX_list, valid_size, n_folds, shuffle, random_state,
             use_partial_fit, info, timeout, _conn, loss_fn=None,
             best_loss=None):
    '''Calculate the loss function
    '''
    try:
        t_start = time.time()
        # Extract info from calling function.
        if 'classifier' in argd:
            classifier = argd['classifier']
            regressor = argd['regressor']
            preprocessings = argd['preprocessing']
            ex_pps_list = argd['ex_preprocs']
        else:
            classifier = argd['model']['classifier']
            regressor = argd['model']['regressor']
            preprocessings = argd['model']['preprocessing']
            ex_pps_list = argd['model']['ex_preprocs']
        learner = classifier if classifier is not None else regressor
        is_classif = classifier is not None
        untrained_learner = copy.deepcopy(learner)
        # -- N.B. modify argd['preprocessing'] in-place

        # Determine cross-validation iterator.
        if n_folds is not None:
            if n_folds == -1:
                info('Will use leave-one-out CV')
                cv_iter = LeaveOneOut(len(y))
            elif is_classif:
                info('Will use stratified K-fold CV with K:', n_folds,
                     'and Shuffle:', shuffle)
                cv_iter = StratifiedKFold(y, n_folds=n_folds, shuffle=shuffle,
                                          random_state=random_state)
            else:
                info('Will use K-fold CV with K:', n_folds,
                     'and Shuffle:', shuffle)
                cv_iter = KFold(len(y), n_folds=n_folds, shuffle=shuffle,
                                random_state=random_state)
        else:
            if not shuffle:  # always choose the last samples.
                info('Will use the last', valid_size,
                     'portion of samples for validation')
                n_train = int(len(y) * (1 - valid_size))
                valid_fold = np.ones(len(y), dtype=np.int)
                valid_fold[:n_train] = -1  # "-1" indicates train fold.
                cv_iter = PredefinedSplit(valid_fold)
            elif is_classif:
                info('Will use stratified shuffle-and-split with validation '
                     'portion:', valid_size)
                cv_iter = StratifiedShuffleSplit(y, 1, test_size=valid_size,
                                                 random_state=random_state)
            else:
                info('Will use shuffle-and-split with validation portion:',
                     valid_size)
                cv_iter = ShuffleSplit(len(y), 1, test_size=valid_size,
                                       random_state=random_state)

        # Use the above iterator for cross-validation prediction.
        cv_y_pool = np.array([])
        cv_pred_pool = np.array([])
        cv_n_iters = np.array([])
        for train_index, valid_index in cv_iter:
            Xfit, Xval = X[train_index], X[valid_index]
            yfit, yval = y[train_index], y[valid_index]
            if EX_list is not None:
                _EX_list = [(EX[train_index], EX[valid_index])
                            for EX in EX_list]
                EXfit_list, EXval_list = zip(*_EX_list)
            else:
                EXfit_list = None
                EXval_list = None
            XEXfit, XEXval = transform_combine_XEX(
                Xfit, info, preprocessings, Xval,
                EXfit_list, ex_pps_list, EXval_list
            )
            learner = copy.deepcopy(untrained_learner)
            info('Training learner', learner, 'on X/EX of dimension',
                 XEXfit.shape)
            if hasattr(learner, "partial_fit") and use_partial_fit:
                learner, n_iters = pfit_until_convergence(
                    learner, is_classif, XEXfit, yfit, info,
                    best_loss=best_loss, XEXval=XEXval, yval=yval,
                    timeout=timeout, t_start=t_start
                )
            else:
                learner.fit(XEXfit, yfit)
                n_iters = None
            if learner is None:
                break
            cv_y_pool = np.append(cv_y_pool, yval)
            info('Scoring on X/EX validation of shape', XEXval.shape)
            cv_pred_pool = np.append(cv_pred_pool, learner.predict(XEXval))
            cv_n_iters = np.append(cv_n_iters, n_iters)
        else:  # all CV folds are exhausted.
            if loss_fn is None:
                if is_classif:
                    loss = 1 - accuracy_score(cv_y_pool, cv_pred_pool)
                    # -- squared standard error of mean
                    lossvar = (loss * (1 - loss)) / max(1, len(cv_y_pool) - 1)
                    info('OK trial with accuracy %.1f +- %.1f' % (
                        100 * (1 - loss), 100 * np.sqrt(lossvar)))
                else:
                    loss = 1 - r2_score(cv_y_pool, cv_pred_pool)
                    lossvar = None  # variance of R2 is undefined.
                    info('OK trial with R2 score %.2e' % (1 - loss))
            else:
                # Use a user specified loss function
                loss = loss_fn(cv_y_pool, cv_pred_pool)
                lossvar = None
                info('OK trial with loss %.1f' % loss)
            t_done = time.time()
            rval = {
                'loss': loss,
                'loss_variance': lossvar,
                'learner': untrained_learner,
                'preprocs': preprocessings,
                'ex_preprocs': ex_pps_list,
                'status': hyperopt.STATUS_OK,
                'duration': t_done - t_start,
                'iterations': cv_n_iters.max(),
            }
            rtype = 'return'

        # The for loop exited with break; one fold did not finish running.
        if learner is None:
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': 'Not enough time to finish training on '
                           'all CV folds',
                'duration': t_done - t_start,
            }
            rtype = 'return'

    ##==== Cost function exception handling ====##
    except (NonFiniteFeature,) as exc:
        print('Failing trial due to NaN in', str(exc))
        t_done = time.time()
        rval = {
            'status': hyperopt.STATUS_FAIL,
            'failure': str(exc),
            'duration': t_done - t_start,
        }
        rtype = 'return'

    except (ValueError,) as exc:
        if ('k must be less than or equal'
                ' to the number of training points') in str(exc):
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': str(exc),
                'duration': t_done - t_start,
            }
            rtype = 'return'
        else:
            rval = exc
            rtype = 'raise'

    except (AttributeError,) as exc:
        print('Failing due to k_means_ weirdness')
        if "'NoneType' object has no attribute 'copy'" in str(exc):
            # -- sklearn/cluster/k_means_.py line 270 raises this sometimes
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': str(exc),
                'duration': t_done - t_start,
            }
            rtype = 'return'
        else:
            rval = exc
            rtype = 'raise'

    except Exception as exc:
        rval = exc
        rtype = 'raise'

    # -- return the result to calling process
    _conn.send((rtype, rval))
def gridSearchSingleSet(self, parameters, datasetIdx):
    """
    Perform grid search using a single set.
    Goal is to find the optimal classifier parameters
    :return:
    """
    datasetId = self.datasetIds[datasetIdx]
    dataset = {
        'id': datasetId,
        'videoFile': os.path.join(self.projectDirectory, 'videos', datasetId + ".MOV"),
        'cutouts': {
            'posDir': os.path.join(self.projectDirectory, 'cutouts', datasetId, 'pos'),
            'negDir': os.path.join(self.projectDirectory, 'cutouts', datasetId, 'neg')
        },
        'labelsFile': os.path.join(self.projectDirectory, 'labels', datasetId + "_output.txt"),
        'framesDir': os.path.join(self.projectDirectory, 'frames', datasetId),
    }

    foldsCount = 5
    objectIndices = self.getObjectsIndices(dataset['cutouts']['posDir'])
    foldsIndices = self.getFoldsIndices(objectIndices, foldsCount)

    negCutoutFiles = glob.glob(dataset['cutouts']['negDir'] + "/*")
    negSamples = []
    for fileName in negCutoutFiles:
        negSamples.append({
            'fileName': fileName,
            'label': False,
            'features': self.getImageFileFeatures(fileName)
        })
    negSamplesPerFold = len(negSamples) // foldsCount

    samples = []
    foldIdx = 0
    for objectIndices in foldsIndices:
        posCutoutFiles = []
        for objectIndex in objectIndices:
            posCutoutFiles += (glob.glob(dataset['cutouts']['posDir'] +
                                         "/cutout_" + str(objectIndex) + "_*.png"))

        foldPosSamples = []
        for fileName in posCutoutFiles:
            foldPosSamples.append({
                'fileName': fileName,
                'label': True,
                'features': self.getImageFileFeatures(fileName),
                'foldIdx': foldIdx
            })

        foldNegSamples = negSamples[negSamplesPerFold * foldIdx:
                                    negSamplesPerFold * foldIdx + negSamplesPerFold]
        for foldNegSample in foldNegSamples:
            foldNegSample['foldIdx'] = foldIdx

        samples.extend(foldPosSamples + foldNegSamples)
        foldIdx += 1

    X = [sample['features'] for sample in samples]
    y = [sample['label'] for sample in samples]
    test_fold = [sample['foldIdx'] for sample in samples]
    ps = PredefinedSplit(test_fold=test_fold)

    if self.classifierType == 'SVM-RBF':
        est = svm.SVC()
        parameters = {'kernel': ['rbf'], 'C': parameters['C'], 'gamma': parameters['gamma']}
    elif self.classifierType == 'KNN':
        est = KNeighborsClassifier()
        parameters = {'n_neighbors': parameters['n_neighbors']}
    elif self.classifierType == 'SVM-LIN':
        est = svm.SVC()
        parameters = {'C': parameters['C']}
    else:
        raise Exception("Unknown classifier type %s" % self.classifierType)

    clf = GridSearchCV(
        estimator=est,
        param_grid=parameters,
        n_jobs=1,
        pre_dispatch='2*n_jobs',
        iid=False,
        refit=True,
        cv=ps
    )
    clf.fit(X, y)

    print("=== Scores:")
    pprint(clf.grid_scores_)
    print("=== Best score:")
    pprint(clf.best_score_)
    print("=== Best params:")
    pprint(clf.best_params_)

    return clf.best_score_
def gridSearch(self, parameters):
    """
    Perform a grid search using all the available datasets.
    Goal is to find the optimal classifier parameters
    :return:
    """
    datasets = []
    for setIdx, datasetId in enumerate(self.datasetIds):
        dataset = {
            'id': datasetId,
            'videoFile': os.path.join(self.projectDirectory, 'videos', datasetId + ".MOV"),
            'cutouts': {
                'posDir': os.path.join(self.projectDirectory, 'cutouts', datasetId, 'pos'),
                'negDir': os.path.join(self.projectDirectory, 'cutouts', datasetId, 'neg')
            },
            'labelsFile': os.path.join(self.projectDirectory, 'labels', datasetId + "_output.txt"),
            'framesDir': os.path.join(self.projectDirectory, 'frames', datasetId),
        }

        posCutoutFiles = (glob.glob(dataset['cutouts']['posDir'] + "/*.png"))
        posLabels = [True] * len(posCutoutFiles)
        negCutoutFiles = (glob.glob(dataset['cutouts']['negDir'] + "/*.png"))
        negLabels = [False] * len(negCutoutFiles)

        samples = []
        for cutoutFile, label in zip(posCutoutFiles + negCutoutFiles,
                                     posLabels + negLabels):
            samples.append({
                'fileName': cutoutFile,
                'label': label,
                'features': self.getImageFeatures(cv2.imread(cutoutFile)),
                'foldIdx': setIdx
            })
        dataset['samples'] = samples
        datasets.append(dataset)

    X = []
    y = []
    test_fold = []
    for dataset in datasets:
        X.extend([sample['features'] for sample in dataset['samples']])
        y.extend([sample['label'] for sample in dataset['samples']])
        test_fold.extend([sample['foldIdx'] for sample in dataset['samples']])

    ps = PredefinedSplit(test_fold=test_fold)

    if self.classifierType == 'SVM-RBF':
        est = svm.SVC()
        parameters = {'kernel': ['rbf'], 'C': parameters['C'], 'gamma': parameters['gamma']}
    elif self.classifierType == 'KNN':
        est = KNeighborsClassifier()
        parameters = {'n_neighbors': parameters['n_neighbors']}
    elif self.classifierType == 'SVM-LIN':
        est = svm.SVC()
        parameters = {'kernel': ['linear'], 'C': parameters['C']}
    else:
        raise Exception("Unknown classifier type %s" % self.classifierType)

    clf = GridSearchCV(
        estimator=est,
        param_grid=parameters,
        n_jobs=1,
        pre_dispatch='2*n_jobs',
        iid=False,
        refit=True,
        cv=ps
    )
    clf.fit(X, y)

    print("=== Scores:")
    pprint(clf.grid_scores_)
    print("=== Best score:")
    pprint(clf.best_score_)
    print("=== Best params:")
    pprint(clf.best_params_)

    return clf.best_score_
# increase processing time in a combinatorial way
parameters = {
    'tfidf__max_df': [0.75, 1.],  # filters out a few dozen words
    'tfidf__min_df': (5, 10, 20, 50),
    'tfidf__max_features': (200000, 400000, 600000),
    'tfidf__ngram_range': [(1, 3)],  # unigrams or trigrams; use trigrams
    'tfidf__use_idf': [1],
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000005, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__n_iter': (10, 50, 80),
}

test_fold = np.zeros((train_docs_num + val_docs_num), dtype='int')
test_fold[:train_docs_num] = -1
ps = PredefinedSplit(test_fold=test_fold)

t0 = time()
my_score = make_scorer(f1_func, greater_is_better=True)
grid_search = GridSearchCV(pipeline, parameters, cv=ps, n_jobs=-1, verbose=1,
                           scoring=my_score)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
def transform(self, X):
    X = X.set_index('SalesID')[self.columns].sort_index()
    return X


if __name__ == '__main__':
    df = pd.read_csv('Train.csv')
    df = df.set_index('SalesID').sort_index()
    y = df.SalePrice

    # This is for the predefined split... we want -1 for our training split,
    # 0 for the test split.
    cv_cutoff_date = pd.to_datetime('2011-01-01')
    cv = -1 * (pd.to_datetime(df.saledate) < cv_cutoff_date).astype(int)
    cross_val = PredefinedSplit(cv)

    p = Pipeline([('filter', FilterColumns()),
                  ('type_change', DataType()),
                  ('replace_outliers', ReplaceOutliers()),
                  ('compute_age', ComputeAge()),
                  ('nearest_average', ComputeNearestMean()),
                  ('columns', ColumnFilter()),
                  ('lm', LinearRegression())])
    df = df.reset_index()

    def rmsle(y_hat, y):
        target = y
        predictions = y_hat
        log_diff = np.log(predictions + 1) - np.log(target + 1)
        return np.sqrt(np.mean(log_diff ** 2))

    # GridSearch
train_data = pd.read_csv(open('semeval2016-task6-trainingdata-utf-8.txt'), '\t',
                         encoding='utf8', index_col=0)

targets = list(train_data.Target.unique())

for target in targets:
    print 80 * "="
    print target
    print 80 * "="

    target_idx = train_data.Target == target
    target_train_data = train_data[target_idx]
    target_true_stances = target_train_data.Stance

    print 'training instances:', len(train_data)
    print 'target training instances:', len(target_train_data)

    target_cv = StratifiedKFold(target_true_stances, n_folds=5, shuffle=True,
                                random_state=13)

    predef_test_fold = -np.ones(len(train_data), dtype='int')
    predef_test_fold[np.where(target_idx)] = target_cv.test_folds
    train_cv = PredefinedSplit(predef_test_fold)

    for train, test in train_cv:
        print len(train), len(test), len(train) + len(test)
        print train_data.Target.iloc[test]
def data_init(X_train, X_val, X_test, y_train, y_val, y_test, k):
    # min-max normalization
    min_max_scaler = MinMaxScaler()
    min_max_scaler.fit(X_train)
    X_train = min_max_scaler.transform(X_train)
    X_val = min_max_scaler.transform(X_val)
    X_test = min_max_scaler.transform(X_test)

    # split the data for ensemble learning
    X_train_1 = X_train[y_train == 1]
    y_train_1 = y_train[y_train == 1]
    X_train_0 = X_train[y_train == 0]
    y_train_0 = y_train[y_train == 0]

    step_size = X_train_0.shape[0] // k
    X_train_need = []
    y_train_need = []
    for i in range(k):
        tmp_x = X_train_0[i * step_size:min(X_train_0.shape[0] - 1, i * step_size + step_size)]
        tmp_y = y_train_0[i * step_size:min(X_train_0.shape[0] - 1, i * step_size + step_size)]
        X_train_need.append(np.concatenate((X_train_1, tmp_x), axis=0))
        y_train_need.append(np.concatenate((y_train_1, tmp_y), axis=0))

    C_params = [i / 10 for i in range(1, 10)] + \
               [i for i in range(1, 10, 1)] + \
               [i for i in range(10, 100, 10)]
    cw_params = [i for i in range(1, 10, 1)] + [i for i in range(10, 100, 10)]
    params = {
        'C': C_params,
        'class_weight': [{1: w} for w in cw_params],
    }

    train_pred_prob_record = []
    val_pred_prob_record = []
    test_pred_prob_record = []
    for i in range(k):
        print(i)
        train_val_features = np.concatenate((X_train_need[i], X_val), axis=0)
        train_val_labels = np.concatenate((y_train_need[i], y_val), axis=0)

        test_fold = np.zeros(train_val_features.shape[0])
        test_fold[:X_train_need[i].shape[0]] = -1
        ps = PredefinedSplit(test_fold=test_fold)

        model = GridSearchCV(estimator=LogisticRegression(), param_grid=params,
                             scoring=my_scoring, n_jobs=-1, cv=ps, verbose=0)
        model.fit(train_val_features, train_val_labels)
        print(model.best_params_)
        print(model.best_score_)

        train_pr = model.predict_proba(X_train)[:, 1]
        val_pr = model.predict_proba(X_val)[:, 1]
        test_pr = model.predict_proba(X_test)[:, 1]
        utils.model_key_performance(train_pr, y_train)
        utils.model_key_performance(val_pr, y_val)
        utils.model_key_performance(test_pr, y_test)

        train_pred_prob_record.append(train_pr)
        val_pred_prob_record.append(val_pr)
        test_pred_prob_record.append(test_pr)

    train_ensum = np.array(train_pred_prob_record).T
    val_ensum = np.array(val_pred_prob_record).T
    test_ensum = np.array(test_pred_prob_record).T
    return train_ensum, val_ensum, test_ensum