def test_repeated_stratified_kfold_deterministic_split():
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    y = [1, 1, 1, 0, 0]
    random_state = 1944695409
    rskf = RepeatedStratifiedKFold(
        n_splits=2, n_repeats=2, random_state=random_state)

    # split should produce the same, deterministic splits on each call
    for _ in range(3):
        splits = rskf.split(X, y)
        train, test = next(splits)
        assert_array_equal(train, [1, 4])
        assert_array_equal(test, [0, 2, 3])

        train, test = next(splits)
        assert_array_equal(train, [0, 2, 3])
        assert_array_equal(test, [1, 4])

        train, test = next(splits)
        assert_array_equal(train, [2, 3])
        assert_array_equal(test, [0, 1, 4])

        train, test = next(splits)
        assert_array_equal(train, [0, 1, 4])
        assert_array_equal(test, [2, 3])

        assert_raises(StopIteration, next, splits)
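# A minimal standalone sketch (assuming scikit-learn and NumPy are installed)
# of the same determinism property the test above relies on: because the seed
# is fixed at construction, repeated calls to split() enumerate identical folds.
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold

X = np.arange(10).reshape(5, 2)
y = np.array([1, 1, 1, 0, 0])
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, random_state=0)
first = [(tr.tolist(), te.tolist()) for tr, te in cv.split(X, y)]
second = [(tr.tolist(), te.tolist()) for tr, te in cv.split(X, y)]
assert first == second  # deterministic across calls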
def main(dataset_name):
    dataset = load_dataset()
    raw_data = np.asarray(dataset['raw']['data'])
    raw_label = np.asarray(dataset['raw']['label'])
    num_classes = len(np.unique(raw_label))

    rskf = RepeatedStratifiedKFold(n_splits=k_folds, n_repeats=k_fold_reps,
                                   random_state=42)

    print('L2X-Method')
    cont_seed = 0

    nfeats = []
    accuracies = []
    model_accuracies = []
    svc_accuracies = []
    fs_time = []
    BAs = []
    svc_BAs = []
    model_BAs = []
    mAPs = []
    svc_mAPs = []
    model_mAPs = []
    mus = []
    name = dataset_name + '_' + kernel + '_mu_' + str(mu)
    print(name)

    for j, (train_index, test_index) in enumerate(
            rskf.split(raw_data, raw_label)):
        print('k_fold', j, 'of', k_folds * k_fold_reps)

        train_data, train_labels = raw_data[train_index], raw_label[train_index]
        test_data, test_labels = raw_data[test_index], raw_label[test_index]

        train_labels = to_categorical(train_labels, num_classes=num_classes)
        test_labels = to_categorical(test_labels, num_classes=num_classes)

        valid_features = np.where(np.abs(train_data).sum(axis=0) > 0)[0]
        if len(valid_features) < train_data.shape[1]:
            print('Removing', train_data.shape[1] - len(valid_features),
                  'zero features')
            train_data = train_data[:, valid_features]
            test_data = test_data[:, valid_features]

        model_kwargs = {
            'mu': mu / len(train_data),
            'kernel': kernel,
            'degree': 3
        }
        svc_kwargs = {'C': 1.0, 'solver': 0.}

        for i, n_features in enumerate([10, 50, 100, 150, 200]):
            n_accuracies = []
            n_svc_accuracies = []
            n_model_accuracies = []
            n_BAs = []
            n_svc_BAs = []
            n_model_BAs = []
            n_mAPs = []
            n_svc_mAPs = []
            n_model_mAPs = []
            n_train_accuracies = []
            n_time = []
            print('n_features : ', n_features)

            heatmaps = []
            for r in range(reps):
                np.random.seed(cont_seed)
                K.tf.set_random_seed(cont_seed)
                cont_seed += 1
                model = train_Keras(
                    train_data, train_labels, test_data, test_labels,
                    model_kwargs,
                    l2x_model_func=get_l2x_model,
                    n_features=n_features,
                )
                heatmaps.append(model.heatmap)
                n_time.append(model.fs_time)
                test_data_norm = model.normalization.transform(test_data)
                train_data_norm = model.normalization.transform(train_data)
                test_pred = model.predict(test_data_norm)
                n_model_accuracies.append(
                    model.evaluate(test_data_norm, test_labels, verbose=0)[-1])
                n_model_BAs.append(balance_accuracy(test_labels, test_pred))
                n_model_mAPs.append(
                    average_precision_score(test_labels[:, -1], test_pred))
                train_acc = model.evaluate(train_data_norm, train_labels,
                                           verbose=0)[-1]
                print('n_features : ', n_features,
                      ', accuracy : ', n_model_accuracies[-1],
                      ', BA : ', n_model_BAs[-1],
                      ', mAP : ', n_model_mAPs[-1],
                      ', train_accuracy : ', train_acc,
                      ', time : ', n_time[-1], 's')
                del model
                K.clear_session()

            heatmap = np.mean(heatmaps, axis=0)
            best_features = np.argsort(heatmap)[::-1][:n_features]

            svc_train_data = train_data[:, best_features]
            svc_test_data = test_data[:, best_features]

            norm = normalization_func()
            svc_train_data_norm = norm.fit_transform(svc_train_data)
            svc_test_data_norm = norm.transform(svc_test_data)

            bestcv = -1
            bestc = None
            bestSolver = None
            for s in [0, 1, 2, 3]:
                for my_c in [0.001, 0.1, 0.5, 1.0, 1.4, 1.5, 1.6, 2.0, 2.5,
                             5.0, 100.0]:
                    cmd = '-v 5 -s ' + str(s) + ' -c ' + str(my_c) + ' -q'
                    cv = liblinearutil.train(
                        (2 * train_labels[:, -1] - 1).tolist(),
                        svc_train_data_norm.tolist(), cmd)
                    if cv > bestcv:
                        # print('Best -> C:', my_c, ', s:', s, ', acc:', cv)
                        bestcv = cv
                        bestc = my_c
                        bestSolver = s
            svc_kwargs['C'] = bestc
            svc_kwargs['solver'] = bestSolver
            print('Best -> C:', bestc, ', s:', bestSolver, ', acc:', bestcv)

            for r in range(reps):
                np.random.seed(cont_seed)
                K.tf.set_random_seed(cont_seed)
                cont_seed += 1

                model = train_SVC(svc_train_data_norm, train_labels, svc_kwargs)
                _, accuracy, test_pred = liblinearutil.predict(
                    (2 * test_labels[:, -1] - 1).tolist(),
                    svc_test_data_norm.tolist(), model, '-q')
                test_pred = np.asarray(test_pred)
                n_svc_accuracies.append(accuracy[0])
                n_svc_BAs.append(balance_accuracy(test_labels, test_pred))
                n_svc_mAPs.append(
                    average_precision_score(test_labels[:, -1], test_pred))
                del model

                model = train_Keras(svc_train_data, train_labels,
                                    svc_test_data, test_labels, model_kwargs)
                train_data_norm = model.normalization.transform(svc_train_data)
                test_data_norm = model.normalization.transform(svc_test_data)
                test_pred = model.predict(test_data_norm)
                n_BAs.append(balance_accuracy(test_labels, test_pred))
                n_mAPs.append(
                    average_precision_score(test_labels[:, -1], test_pred))
                n_accuracies.append(
                    model.evaluate(test_data_norm, test_labels, verbose=0)[-1])
                n_train_accuracies.append(
                    model.evaluate(train_data_norm, train_labels,
                                   verbose=0)[-1])
                del model
                K.clear_session()
                print(
                    'n_features : ', n_features,
                    ', acc : ', n_accuracies[-1],
                    ', BA : ', n_BAs[-1],
                    ', mAP : ', n_mAPs[-1],
                    ', train_acc : ', n_train_accuracies[-1],
                    ', svc_acc : ', n_svc_accuracies[-1],
                    ', svc_BA : ', n_svc_BAs[-1],
                    ', svc_mAP : ', n_svc_mAPs[-1],
                )

            if i >= len(accuracies):
                accuracies.append(n_accuracies)
                svc_accuracies.append(n_svc_accuracies)
                model_accuracies.append(n_model_accuracies)
                BAs.append(n_BAs)
                mAPs.append(n_mAPs)
                fs_time.append(n_time)
                svc_BAs.append(n_svc_BAs)
                svc_mAPs.append(n_svc_mAPs)
                model_BAs.append(n_model_BAs)
                model_mAPs.append(n_model_mAPs)
                nfeats.append(n_features)
                mus.append(model_kwargs['mu'])
            else:
                accuracies[i] += n_accuracies
                svc_accuracies[i] += n_svc_accuracies
                model_accuracies[i] += n_model_accuracies
                fs_time[i] += n_time
                BAs[i] += n_BAs
                mAPs[i] += n_mAPs
                svc_BAs[i] += n_svc_BAs
                svc_mAPs[i] += n_svc_mAPs
                model_BAs[i] += n_model_BAs
                model_mAPs[i] += n_model_mAPs

    output_filename = directory + 'LinearSVC_' + kernel + '_L2X.json'
    if not os.path.isdir(directory):
        os.makedirs(directory)

    info_data = {
        'kernel': kernel,
        'reps': reps,
        'classification': {
            'mus': mus,
            'n_features': nfeats,
            'accuracy': accuracies,
            'mean_accuracy': np.array(accuracies).mean(axis=1).tolist(),
            'svc_accuracy': svc_accuracies,
            'mean_svc_accuracy': np.array(svc_accuracies).mean(axis=1).tolist(),
            'model_accuracy': model_accuracies,
            'mean_model_accuracy':
                np.array(model_accuracies).mean(axis=1).tolist(),
            'BA': BAs,
            'mean_BA': np.array(BAs).mean(axis=1).tolist(),
            'mAP': mAPs,
            'mean_mAP': np.array(mAPs).mean(axis=1).tolist(),
            'svc_BA': svc_BAs,
            'svc_mean_BA': np.array(svc_BAs).mean(axis=1).tolist(),
            'svc_mAP': svc_mAPs,
            'svc_mean_mAP': np.array(svc_mAPs).mean(axis=1).tolist(),
            'model_BA': model_BAs,
            'model_mean_BA': np.array(model_BAs).mean(axis=1).tolist(),
            'model_mAP': model_mAPs,
            'model_mean_mAP': np.array(model_mAPs).mean(axis=1).tolist(),
            'fs_time': fs_time
        }
    }

    for k, v in info_data['classification'].items():
        if 'mean' in k:
            print(k, v)

    with open(output_filename, 'w') as outfile:
        json.dump(info_data, outfile)
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
else:
    training_transform, test_transform = 2 * [
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
    ]

if args.dataset == 'cars':
    from data import cars

    stratified_crossvalidation = RepeatedStratifiedKFold(
        n_splits=3, n_repeats=5, random_state=args.seed)
    data = cars.Calltech101(images_folder='images',
                            input_transform=training_transform)
    data.shuffle(seed=args.seed)
    data_size = len(data)
    indexes = np.arange(0, data_size, 1, dtype=int)
    targets = np.asarray(data.targets)
    kfolds = list(stratified_crossvalidation.split(indexes, targets))

    train_dataset = cars.Calltech101(images_folder='images',
                                     input_transform=training_transform)
    train_dataset.shuffle(seed=args.seed)
    train_dataset.prune_dataset(indexes=kfolds[args.cross_validation_split][1])

    test_dataset = cars.Calltech101(images_folder='images',
                                    input_transform=test_transform)
    test_dataset.shuffle(seed=args.seed)
    test_dataset.prune_dataset(indexes=kfolds[args.cross_validation_split][0])
def generate_kfold(X, y=None, n_splits=5, random_state=0, stratified=False,
                   n_repeats=1):
    if stratified and (y is not None):
        if n_repeats > 1:
            kf = RepeatedStratifiedKFold(n_splits=n_splits,
                                         n_repeats=n_repeats,
                                         random_state=random_state)
        else:
            kf = StratifiedKFold(n_splits=n_splits, shuffle=True,
                                 random_state=random_state)
        kf.get_n_splits(X, y)
        return [[train_index, test_index]
                for train_index, test_index in kf.split(X, y)]
    else:
        if n_repeats > 1:
            kf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats,
                               random_state=random_state)
        else:
            kf = KFold(n_splits=n_splits, shuffle=True,
                       random_state=random_state)
        kf.get_n_splits(X)
        return [[train_index, test_index]
                for train_index, test_index in kf.split(X)]
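# A short usage sketch for generate_kfold above (the toy data is assumed, not
# from the original source): stratified, repeated folds come back as a plain
# list, so they can be indexed or iterated several times without re-splitting.
import numpy as np
from sklearn.model_selection import (KFold, RepeatedKFold, StratifiedKFold,
                                     RepeatedStratifiedKFold)

X_demo = np.random.rand(20, 3)
y_demo = np.array([0] * 10 + [1] * 10)
folds = generate_kfold(X_demo, y_demo, n_splits=5, stratified=True,
                       n_repeats=2)
print(len(folds))  # 10 = n_splits * n_repeats
train_idx, test_idx = folds[0]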
def rfe_feature_selection(self, input, output, dict_of_models,
                          list_number_of_features_to_select):
    """
    Performs model evaluation within the N_outer-times repeated N_inner-fold
    cross-validation procedure for different numbers of features selected by
    the RFE algorithm, with nested 10-fold cross-validation for tuning the
    model hyperparameters.
    ----------
    :param input : array-like, shape (n_samples, n_features)
        The training input samples.
    :param output : array-like, shape (n_samples, 1)
        The target values.
    :param dict_of_models : dictionary
        Models with details for grid search.
    :param list_number_of_features_to_select : list
        Numbers of features to select.
    :return df_aucs : DataFrame object, shape (N_outer x N_inner,
        number of models x length of list_number_of_features_to_select)
        AUC values for every step of the N_outer x N_inner-times CV.
    :return df_res : DataFrame object, shape
        ([number of models x length of list_number_of_features_to_select], 9)
        For every model and every number of selected features, the best
        classifier parameters and averaged classification metrics are
        provided: Accuracy, Sensitivity, Specificity, Precision, F1-score,
        AUC.
    :return df_stds : DataFrame object, shape
        ([number of models x length of list_number_of_features_to_select], 8)
        For every model and every number of selected features, the standard
        deviations of the classification metrics are provided.
    """
    df_res = pd.DataFrame(columns=[
        'FS Method', 'Classifier', 'Selected features', 'Best parameters',
        'Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'F1-score',
        'ROC_AUC'
    ])
    df_stds = pd.DataFrame(columns=[
        'FS Method', 'Classifier', 'Selected features', 'Acc_std', 'Sens_std',
        'Spec_std', 'Prec_std', 'F1_std', 'ROC_AUC_std'
    ])
    df_aucs = pd.DataFrame()

    for m in dict_of_models:
        for k in list_number_of_features_to_select:
            accuracy = []
            aucs = []
            sensitivity = []
            specificity = []
            precision = []
            f1score = []
            cohen_kappa = []
            tprs = []
            params = []

            X, y = input, output
            skf = RepeatedStratifiedKFold(n_splits=self.N_inner,
                                          n_repeats=self.N_outer,
                                          random_state=88)
            clf = m['classifier']
            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                best_params = []
                rfe = RFE(estimator=clf, n_features_to_select=k, step=1)
                rfe_smote_clf = Pipeline([('oversampling',
                                           SMOTE(random_state=88)),
                                          ('feature_selection', rfe),
                                          ('classifier', clf)])
                param_grid = m['grid']
                gridsearch_cv = GridSearchCV(rfe_smote_clf, param_grid,
                                             cv=10, scoring='roc_auc')
                gridsearch_cv.fit(X_train, y_train)
                best_params.append(gridsearch_cv.best_params_)

                # predicted class
                y_predict = gridsearch_cv.predict(X_test)
                # predicted probabilities
                probas_ = gridsearch_cv.predict_proba(X_test)

                # accuracy
                acc = accuracy_score(y_predict, y_test)
                accuracy.append(acc)
                # sensitivity = recall
                sens = recall_score(y_test, y_predict)
                sensitivity.append(sens)
                # specificity
                spec = self.get_specificity(y_test, y_predict)
                specificity.append(spec)
                # precision
                prec = precision_score(y_test, y_predict)
                precision.append(prec)
                # f1-score
                f1 = f1_score(y_test, y_predict)
                f1score.append(f1)
                # cohen-kappa-score
                kappa = cohen_kappa_score(y_test, y_predict)
                cohen_kappa.append(kappa)

                # compute ROC curve and area under the curve
                fpr, tpr, thresholds = roc_curve(y[test_index], probas_[:, 1])
                tprs.append(interp(self.mean_fprs, fpr, tpr))
                tprs[-1][0] = 0.0
                roc_auc = auc(fpr, tpr)
                aucs.append(roc_auc)

                # best parameters
                params.append(best_params)

            df_aucs[m['name'] + str(k)] = aucs
            df_stds = df_stds.append(
                {
                    'Classifier': m['name'],
                    'Selected features': k,
                    'Acc_std': np.std(accuracy),
                    'Sens_std': np.std(sensitivity),
                    'Spec_std': np.std(specificity),
                    'Prec_std': np.std(precision),
                    'F1_std': np.std(f1score),
                    'ROC_AUC_std': np.std(aucs)
                },
                ignore_index=True)
            df_res = df_res.append(
                {
                    'Classifier': m['name'],
                    'Selected features': k,
                    'Best parameters': params,
                    'Accuracy': np.mean(accuracy),
                    'Sensitivity': np.mean(sensitivity),
                    'Specificity': np.mean(specificity),
                    'Precision': np.mean(precision),
                    'F1-score': np.mean(f1score),
                    'ROC_AUC': np.mean(aucs)
                },
                ignore_index=True)

    return df_aucs, df_res, df_stds
def best_models_ROC_curves(self, input, output, models_dict, show_plot):
    """
    Plots ROC curves for the best evaluated methods, or returns the averaged
    predicted probabilities for every sample from all selected models.
    Parameters
    ----------
    :param input : array-like, shape (n_samples, n_features)
        The training input samples.
    :param output : array-like, shape (n_samples, 1)
        The target values.
    :param models_dict : dictionary
        Models with details for grid search.
    :param show_plot : boolean
        Indicates whether the plot should be rendered.
    :return [selected_models]_proba_[number_of_selected_features] :
        array-like, shape (1, n_samples)
        Averaged predicted probabilities for every sample by every selected
        model.
    :return plot
    """
    X, y = input, output
    mean_fpr = np.linspace(0, 1, 100)
    instances = X.shape[0]
    proba = np.zeros((len(models_dict), 10, instances))
    skf = RepeatedStratifiedKFold(n_splits=self.N_inner,
                                  n_repeats=self.N_outer, random_state=88)

    plt.figure(1)
    plt.rcParams["figure.figsize"] = (10, 6)
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='brown',
             label='Chance', alpha=.8)

    j = 0
    for m in models_dict:
        tprs = []
        aucs = []
        clf = m['classifier']
        fs = m['fs_method']
        color_line = m['color_line']
        color_shadow = m['color_shadow']
        k = 1
        i = 0
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            fs_smote_clf = Pipeline([('oversampling', SMOTE(random_state=88)),
                                     ('feature_selection', fs),
                                     ('classifier', clf)])
            param_grid = m['grid']
            # for survival problem
            if m['name'] == 'MLP_mRMR' or m['name'] == 'SVM_mRMR_50':
                gridsearch_cv = fs_smote_clf
            else:
                gridsearch_cv = GridSearchCV(fs_smote_clf, param_grid, cv=10,
                                             scoring='roc_auc')
            gridsearch_cv.fit(X_train, y_train)

            # predicted probabilities
            probas_ = gridsearch_cv.predict_proba(X_test)
            if k > self.N_outer:
                break
            elif i >= k * self.N_inner:
                k += 1
            proba[j, k - 1, test_index] = probas_[:, 1]

            # compute ROC curve
            fpr, tpr, thresholds = roc_curve(y[test_index], probas_[:, 1])
            tprs.append(interp(self.mean_fprs, fpr, tpr))
            tprs[-1][0] = 0.0
            roc_auc = auc(fpr, tpr)
            aucs.append(roc_auc)
            i += 1

        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        plt.plot(mean_fpr, mean_tpr, color=color_line,
                 label=r'Mean ROC %s (AUC = %0.2f $\pm$ %0.2f)' %
                       (m['name'], mean_auc, std_auc),
                 lw=2, alpha=.8)

        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color=color_shadow,
                         alpha=.2,
                         label=r'$\pm$ 1 std. dev. for ' + m['name'])
        j += 1

    if show_plot:
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        plt.show()
        return
    else:
        mean_proba = np.mean(proba, axis=1)
        return mean_proba
def cross_val(
    self,
    X=None,
    y=None,
    X_test=None,
    model=None,
    folds=10,
    score_folds=5,
    n_repeats=2,
    print_metric=False,
    metric_round=4,
    predict=False,
    get_feature_importance=False,
):
    """
    Cross-validation function.

    Args:
        X: training features (defaults to model._data.X_train).
        y: training target (defaults to model._data.y_train).
        X_test: test features (defaults to model._data.X_test).
        model: estimator wrapper (defaults to self).
        folds (int): number of folds per repeat.
        score_folds (int): number of folds actually scored when not predicting.
        n_repeats (int): number of CV repeats.
        print_metric (bool): whether to print the averaged metric.
        metric_round (int): decimals used when rounding the score.
        predict (bool): whether to produce out-of-fold and test predictions.
        get_feature_importance (bool): whether to accumulate feature
            importances across folds.

    Returns:
        result (dict)
    """
    if model is None:
        model = self
    if X is None:
        X = model._data.X_train
    if y is None:
        y = model._data.y_train
    if X_test is None:
        X_test = model._data.X_test

    if predict and (X_test is None):
        raise Exception("No X_test for predict")

    if model.type_of_estimator == 'classifier':
        skf = RepeatedStratifiedKFold(
            n_splits=folds,
            n_repeats=n_repeats,
            random_state=model._random_state,
        )
    else:
        skf = RepeatedKFold(
            n_splits=folds,
            n_repeats=n_repeats,
            random_state=model._random_state,
        )

    folds_scores = []
    stacking_y_pred_train = np.zeros(len(X))
    stacking_y_pred_test = np.zeros(len(X_test))
    feature_importance_df = pd.DataFrame(np.zeros(len(X.columns)),
                                         index=X.columns)

    for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        train_x, train_y = X.iloc[train_idx], y.iloc[train_idx]
        val_x, val_y = X.iloc[valid_idx], y.iloc[valid_idx]

        # TargetEncoders
        train_x, val_x, X_test = model.preproc_data_in_cv(
            train_x, train_y, val_x, X_test)

        # Fit
        model._fit(
            model=model,
            X_train=train_x.reset_index(drop=True),
            y_train=train_y.reset_index(drop=True),
            X_test=val_x.reset_index(drop=True),
            y_test=val_y.reset_index(drop=True),
        )

        # Predict
        if (model.metric.__name__ in predict_proba_metrics) and (
                model.is_possible_predict_proba()):
            y_pred = model._predict_proba(val_x)
            if predict:
                y_pred_test = model._predict_proba(X_test)
        else:
            y_pred = model._predict(val_x)
            if predict:
                y_pred_test = model._predict(X_test)

        score_model = model.metric(val_y, y_pred)
        folds_scores.append(score_model)

        if get_feature_importance:
            feature_importance_df += model._get_feature_importance(train_x)

        if predict:
            stacking_y_pred_train[valid_idx] += y_pred
            stacking_y_pred_test += y_pred_test
        else:
            # score_folds
            if i + 1 >= score_folds:
                break

    if predict:
        stacking_y_pred_train = stacking_y_pred_train / n_repeats
        stacking_y_pred_test = stacking_y_pred_test / (folds * n_repeats)

    if score_folds > 1 or predict:
        score = round(np.mean(folds_scores), metric_round)
        score_std = round(np.std(folds_scores), metric_round + 2)
    else:
        score = round(score_model, metric_round)
        score_std = 0

    if print_metric:
        print(
            f'\n Mean Score {model.metric.__name__} on {i + 1} Folds: '
            f'{score} std: {score_std}'
        )

    # Total
    result = {
        'Score': score,
        'Score_Std': score_std,
        'Test_predict': stacking_y_pred_test,
        'Train_predict': stacking_y_pred_train,
        'Feature_importance': dict(feature_importance_df[0]),
    }
    return result
def cross_validate(feature_name, classifier_name, X, y, cv_num_folds,
                   cv_num_repeats):
    """Runs repeated stratified $k$-fold cross-validation.

    Returns multiple cross-validation metrics as a dictionary, where for each
    metric mean and variance across multiple repeats and folds is summarized.

    Args:
        feature_name: (string) Name of the WALS feature.
        classifier_name: (string) Classifier name.
        X: (numpy array) Input features.
        y: (numpy array) Labels.
        cv_num_folds: (int) Number of folds ($k$).
        cv_num_repeats: (int) Number of repetitions.

    Returns:
        Dictionary containing cross-validation scores and stats.
    """
    model = _make_classifier(classifier_name)
    scoring = ["f1_micro", "precision_micro", "recall_micro", "accuracy"]
    try:
        # Really primitive logic to figure out class distribution.
        _, y_counts = np.unique(y, return_counts=True)
        y_max_freq = np.max(y_counts)
        # Check if the class counts are not reliable to run cross-validation.
        if y_max_freq < cv_num_folds:
            logging.warning(
                "[%s] %s: Not enough data. Fitting the model instead "
                "of running CV", feature_name, classifier_name)
            # Simply fit the model.
            model.fit(X, y)
            cv_scores = {}
            cv_scores["accuracy"] = (model.score(X, y), 0.0)
            cv_scores[MODEL_INFO_SPARSITY_KEY] = True
            return cv_scores
        else:
            logging.info(
                "[%s] Running cross-validation of %s (k=%d, n=%d) ...",
                feature_name, classifier_name, cv_num_folds, cv_num_repeats)
            # Run cross-validation.
            cv = RepeatedStratifiedKFold(n_splits=cv_num_folds,
                                         n_repeats=cv_num_repeats,
                                         random_state=_RANDOM_STATE)
            cv_scores = model_selection.cross_validate(model, X, y, cv=cv,
                                                       scoring=scoring,
                                                       n_jobs=cv_num_folds)
            cv_scores[MODEL_INFO_SPARSITY_KEY] = False
    except Exception as e:  # pylint: disable=broad-except
        logging.error("[%s] %s: CV: Exception: %s", feature_name,
                      classifier_name, e)
        return None

    del cv_scores["fit_time"]
    del cv_scores["score_time"]
    for score_name in scoring:
        scores_vec_key = "test_" + score_name
        cv_scores[score_name] = (np.mean(cv_scores[scores_vec_key]),
                                 np.var(cv_scores[scores_vec_key]))
        del cv_scores[scores_vec_key]

    # Sanity check.
    if math.isnan(cv_scores["accuracy"][0]):
        return None

    logging.info("[train] %s: CV scores for %s: %s", feature_name,
                 classifier_name, cv_scores)
    return cv_scores
        'use_particle_clamp_each_iteration': False,
        'unchanged_iterations_stop': 20000,
        'use_only_early_stopping': False  # no early stopping used, that is why 20k
    },
    'pso_velocity_clamp': (-1, 1),
    'n_particles': 100,
    'pso_iters': 5000,
    'pso_optimizer': PSO,
}
CONFIG['cv'] = StratifiedKFold(n_splits=CONFIG['n_splits'], shuffle=True,
                               random_state=CONFIG['random_state']) \
    if CONFIG['n_repeats'] == 1 \
    else RepeatedStratifiedKFold(n_splits=CONFIG['n_splits'],
                                 n_repeats=CONFIG['n_repeats'],
                                 random_state=CONFIG['random_state'])

INPUT_FEATURES = torch.load(CONFIG['labels_features_common_name'] +
                            "_features.tr").numpy()
INPUT_LABELS = torch.load(CONFIG['labels_features_common_name'] +
                          "_labels.tr").numpy()

make_experiment_reproducible(CONFIG['random_state'])


def run_cross_validation_psobp(file_to_print) -> torch.Tensor:
    logger = logging.getLogger('10_fold_cv')
    configure_logger_by_default(logger)
    logger.info("START run_cross_validation")

    def print_info(info):
        logger.info(info)
np.set_printoptions(suppress=True)

# Load the dataset
digits = load_digits()
# Images
images = digits.images
# Labels
y = digits.target

# Flatten the images into vectors
X = images.reshape((images.shape[0], -1))
print(X.shape)

# Stratified split into 5 train/test folds
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1410)
results = []

# iterate over the train/test index sets of each fold
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # PCA feature extraction
    pca = PCA(n_components=X_train.shape[1], random_state=1410)
    pca.fit(X_train)

    # fraction of explained variance
    evr = pca.explained_variance_ratio_
    evr_acc = np.add.accumulate(evr)
    print(evr_acc)
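# A possible follow-up (not in the original script): use the accumulated
# explained-variance ratio to pick the smallest number of PCA components
# covering, say, 95% of the variance.
n_components_95 = int(np.argmax(evr_acc >= 0.95)) + 1
print('components for 95% variance:', n_components_95)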
def CueDesc_SegDecAnalysis(dat):
    nPe = 100
    nRepeats = 10
    nSh = 50
    njobs = 20

    trConds = dat['TrialConds']
    trDat = dat['TrialLongMat']
    nUnits = dat['fitTable2'].shape[0]

    gTrialsIDs = trConds['Good']
    Trials = trConds[gTrialsIDs].index.values
    nTrials = len(Trials)

    allZoneFR, unitIDs = reformatFRDat(dat, Trials)

    CoTrials = trConds[gTrialsIDs & (trConds['Co'] == 'Co')].index.values
    InCoTrials = trConds[gTrialsIDs & (trConds['Co'] == 'InCo')].index.values
    nInCo = len(InCoTrials)

    TrSets = {}
    TrSets['all'] = np.arange(nTrials)
    _, idx, _ = np.intersect1d(np.array(Trials), np.array(CoTrials),
                               return_indices=True)
    TrSets['co'] = idx
    _, idx, _ = np.intersect1d(np.array(Trials), np.array(InCoTrials),
                               return_indices=True)
    TrSets['inco'] = idx

    cueVec = trConds.loc[gTrialsIDs]['Cues'].values
    descVec = trConds.loc[gTrialsIDs]['Desc'].values
    predVec = {'Cue': cueVec, 'Desc': descVec}

    nFeatures = {'h': np.arange(1), 'a': np.arange(2), 'center': np.arange(3),
                 'be': np.arange(4), 'int': np.arange(5),
                 'cdfg': np.arange(6), 'goal': np.arange(7)}

    def correctTrials_Decoder(train, test):
        res = pd.DataFrame(np.zeros((3, 4)), columns=['Test', 'BAc', 'P', 'Z'])

        temp = mod.fit(X_train[train], y_train[train])
        res.loc[0, 'Test'] = 'Model'
        y_hat = temp.predict(X_train[test])
        res.loc[0, 'BAc'] = bac(y_train[test], y_hat) * 100

        # shuffle for held-out train set
        mod_sh = np.zeros(nSh)
        for sh in np.arange(nSh):
            y_perm_hat = np.random.permutation(y_hat)
            mod_sh[sh] = bac(y_train[test], y_perm_hat) * 100
        res.loc[0, 'Z'] = getPerm_Z(mod_sh, res.loc[0, 'BAc'])
        res.loc[0, 'P'] = getPerm_Pval(mod_sh, res.loc[0, 'BAc'])

        # predictions on X test
        y_hat = temp.predict(X_test)
        res.loc[1, 'Test'] = 'Cue'
        res.loc[1, 'BAc'] = bac(y_test_cue, y_hat) * 100
        res.loc[2, 'Test'] = 'Desc'
        res.loc[2, 'BAc'] = bac(y_test_desc, y_hat) * 100

        # shuffles for y_test cue/desc
        cue_sh = np.zeros(nSh)
        desc_sh = np.zeros(nSh)
        for sh in np.arange(nSh):
            y_perm_hat = np.random.permutation(y_hat)
            cue_sh[sh] = bac(y_test_cue, y_perm_hat) * 100
            desc_sh[sh] = bac(y_test_desc, y_perm_hat) * 100
        res.loc[1, 'Z'] = getPerm_Z(cue_sh, res.loc[1, 'BAc'])
        res.loc[1, 'P'] = getPerm_Pval(cue_sh, res.loc[1, 'BAc'])
        res.loc[2, 'Z'] = getPerm_Z(desc_sh, res.loc[2, 'BAc'])
        res.loc[2, 'P'] = getPerm_Pval(desc_sh, res.loc[2, 'BAc'])

        res['nSeUnits'] = nUnits
        return res

    def balancedCoIncoTrial_Decoder(pe, feats):
        res = pd.DataFrame(np.zeros((2, 4)), columns=['Test', 'BAc', 'P', 'Z'])

        # sample correct trials to match the number of incorrect trials
        samp_co_trials = np.random.choice(TrSets['co'], nInCo, replace=False)
        train = np.concatenate((TrSets['inco'], samp_co_trials))
        test = np.setdiff1d(TrSets['co'], samp_co_trials)

        X_train = allZoneFR.loc[train, feats].values
        X_test = allZoneFR.loc[test, feats].values

        Y_cue_train = predVec['Cue'][train]
        Y_desc_train = predVec['Desc'][train]
        Y_test = predVec['Cue'][test]  # cue and desc are the same on the test set

        # model trained on the cue
        res.loc[0, 'Test'] = 'Cue'
        cue_mod = mod.fit(X_train, Y_cue_train)
        y_cue_hat = cue_mod.predict(X_test)
        res.loc[0, 'BAc'] = bac(Y_test, y_cue_hat) * 100
        cue_sh = np.zeros(nSh)
        for sh in np.arange(nSh):
            y_perm = np.random.permutation(Y_test)
            cue_sh[sh] = bac(y_perm, y_cue_hat) * 100
        res.loc[0, 'Z'] = getPerm_Z(cue_sh, res.loc[0, 'BAc'])
        res.loc[0, 'P'] = getPerm_Pval(cue_sh, res.loc[0, 'BAc'])

        # model trained on the desc
        res.loc[1, 'Test'] = 'Desc'
        desc_mod = mod.fit(X_train, Y_desc_train)
        y_desc_hat = desc_mod.predict(X_test)
        res.loc[1, 'BAc'] = bac(Y_test, y_desc_hat) * 100
        desc_sh = np.zeros(nSh)
        for sh in np.arange(nSh):
            y_perm = np.random.permutation(Y_test)
            desc_sh[sh] = bac(y_perm, y_desc_hat) * 100
        res.loc[1, 'Z'] = getPerm_Z(desc_sh, res.loc[1, 'BAc'])
        res.loc[1, 'P'] = getPerm_Pval(desc_sh, res.loc[1, 'BAc'])

        return res

    def IncoTrial_Decoder(train, test):
        res = pd.DataFrame(np.zeros((3, 4)), columns=['Test', 'BAc', 'P', 'Z'])

        temp = mod.fit(X_train[train], y_train[train])
        res.loc[0, 'Test'] = 'Model'
        y_hat = temp.predict(X_train[test])
        res.loc[0, 'BAc'] = bac(y_train[test], y_hat) * 100

        # shuffle for held-out train set
        mod_sh = np.zeros(nSh)
        for sh in np.arange(nSh):
            y_perm_hat = np.random.permutation(y_hat)
            mod_sh[sh] = bac(y_train[test], y_perm_hat) * 100
        res.loc[0, 'Z'] = getPerm_Z(mod_sh, res.loc[0, 'BAc'])
        res.loc[0, 'P'] = getPerm_Pval(mod_sh, res.loc[0, 'BAc'])

        # predictions on X test
        y_hat = temp.predict(X_test)
        res.loc[1, 'Test'] = 'Cue'
        res.loc[1, 'BAc'] = bac(y_test_cue, y_hat) * 100
        res.loc[2, 'Test'] = 'Desc'
        res.loc[2, 'BAc'] = 100 - res.loc[1, 'BAc']

        # shuffles for y_test cue/desc
        cue_sh = np.zeros(nSh)
        for sh in np.arange(nSh):
            y_perm_hat = np.random.permutation(y_hat)
            cue_sh[sh] = bac(y_test_cue, y_perm_hat) * 100
        res.loc[1, 'Z'] = getPerm_Z(cue_sh, res.loc[1, 'BAc'])
        res.loc[1, 'P'] = getPerm_Pval(cue_sh, res.loc[1, 'BAc'])
        res.loc[2, 'Z'] = getPerm_Z(100 - cue_sh, res.loc[2, 'BAc'])
        res.loc[2, 'P'] = getPerm_Pval(100 - cue_sh, res.loc[2, 'BAc'])

        return res

    with Parallel(n_jobs=njobs) as parallel:
        # correct trials model:
        coModsDec = pd.DataFrame()
        popCoModsDec = pd.DataFrame()
        try:
            nFolds = 10
            y_train = predVec['Cue'][TrSets['co']]
            y_test_cue = predVec['Cue'][TrSets['inco']]
            y_test_desc = predVec['Desc'][TrSets['inco']]

            rskf = RepeatedStratifiedKFold(n_splits=nFolds,
                                           n_repeats=nRepeats,
                                           random_state=0)
            t0 = time.time()
            for unitNum in np.arange(nUnits):
                for p, nF in nFeatures.items():
                    feats = unitIDs[unitNum][nF]
                    mod = lm.LogisticRegression(class_weight='balanced',
                                                C=1 / np.sqrt(len(feats)))
                    X_train = allZoneFR.loc[TrSets['co'], feats].values
                    X_test = allZoneFR.loc[TrSets['inco'], feats].values

                    cnt = 0
                    r = parallel(delayed(correctTrials_Decoder)(train, test)
                                 for train, test in rskf.split(X_train,
                                                               y_train))
                    t1 = time.time()
                    res = pd.DataFrame()
                    for jj in r:
                        res = pd.concat((jj, res))
                    res['Loc'] = p
                    res['-log(P)'] = -np.log(res['P'])
                    res['unit'] = unitNum
                    coModsDec = pd.concat((coModsDec, res))
                    print(end='.')
            coModsDec['Decoder'] = 'Correct'

            # population
            for p, nF in nFeatures.items():
                feats = np.array([])
                for f in nF:
                    feats = np.concatenate((feats,
                                            np.arange(f, nUnits * 7, 7)))
                feats = feats.astype(int)
                mod = lm.LogisticRegression(class_weight='balanced',
                                            C=1 / np.sqrt(len(feats)))
                X_train = allZoneFR.loc[TrSets['co'], feats].values
                X_test = allZoneFR.loc[TrSets['inco'], feats].values

                cnt = 0
                r = parallel(delayed(correctTrials_Decoder)(train, test)
                             for train, test in rskf.split(X_train, y_train))
                res = pd.DataFrame()
                for jj in r:
                    res = pd.concat((jj, res))
                res['Loc'] = p
                res['-log(P)'] = -np.log(res['P'])
                popCoModsDec = pd.concat((popCoModsDec, res))
                print(end='.')
            print('\nDecoding Correct Model Completed. Time = {0:.2f}s \n'
                  .format(time.time() - t0))
            popCoModsDec['Decoder'] = 'Correct'
        except:
            print('CorrectTrials Model Failed.')
            print("Error", sys.exc_info()[0], sys.exc_info()[1],
                  sys.exc_info()[2].tb_lineno)

        # balanced correct/inco model:
        baModsDec = pd.DataFrame()
        popBaModsDec = pd.DataFrame()
        try:
            t0 = time.time()
            for unitNum in np.arange(nUnits):
                for p, nF in nFeatures.items():
                    feats = unitIDs[unitNum][nF]
                    mod = lm.LogisticRegression(class_weight='balanced',
                                                C=1 / np.sqrt(len(feats)))
                    r = parallel(
                        delayed(balancedCoIncoTrial_Decoder)(pe, feats)
                        for pe in np.arange(nPe))
                    res = pd.DataFrame()
                    for jj in r:
                        res = pd.concat((jj, res))
                    res['Loc'] = p
                    res['-log(P)'] = -np.log(res['P'])
                    res['unit'] = unitNum
                    baModsDec = pd.concat((baModsDec, res))
                    print(end='.')
            baModsDec['Decoder'] = 'Balanced'

            # population
            for p, nF in nFeatures.items():
                feats = np.array([])
                for f in nF:
                    feats = np.concatenate((feats,
                                            np.arange(f, nUnits * 7, 7)))
                feats = feats.astype(int)
                mod = lm.LogisticRegression(class_weight='balanced',
                                            C=1 / np.sqrt(len(feats)))
                r = parallel(delayed(balancedCoIncoTrial_Decoder)(pe, feats)
                             for pe in np.arange(nPe))
                res = pd.DataFrame()
                for jj in r:
                    res = pd.concat((jj, res))
                res['Loc'] = p
                res['-log(P)'] = -np.log(res['P'])
                popBaModsDec = pd.concat((popBaModsDec, res))
                print(end='.')
            print('\nDecoding Balanced Model Completed. Time = {0:.2f}s \n'
                  .format(time.time() - t0))
            popBaModsDec['Decoder'] = 'Balanced'
        except:
            print('Balanced Model Failed.')
            print("Error", sys.exc_info()[0], sys.exc_info()[1],
                  sys.exc_info()[2].tb_lineno)

        # incorrect trials model:
        InCoModsDec = pd.DataFrame()
        popInCoModsDec = pd.DataFrame()
        try:
            t0 = time.time()
            nFolds = 5
            y_train = predVec['Cue'][TrSets['inco']]
            y_test_cue = predVec['Cue'][TrSets['co']]
            y_test_desc = predVec['Desc'][TrSets['co']]

            rskf = RepeatedStratifiedKFold(n_splits=nFolds,
                                           n_repeats=nRepeats,
                                           random_state=0)
            for unitNum in np.arange(nUnits):
                for p, nF in nFeatures.items():
                    feats = unitIDs[unitNum][nF]
                    mod = lm.LogisticRegression(class_weight='balanced',
                                                C=1 / np.sqrt(len(feats)))
                    X_train = allZoneFR.loc[TrSets['inco'], feats].values
                    X_test = allZoneFR.loc[TrSets['co'], feats].values

                    cnt = 0
                    r = parallel(delayed(IncoTrial_Decoder)(train, test)
                                 for train, test in rskf.split(X_train,
                                                               y_train))
                    res = pd.DataFrame()
                    for jj in r:
                        res = pd.concat((jj, res))
                    res['Loc'] = p
                    res['-log(P)'] = -np.log(res['P'])
                    res['unit'] = unitNum
                    InCoModsDec = pd.concat((InCoModsDec, res))
                    print(end='.')
            InCoModsDec['Decoder'] = 'Incorrect'

            # population
            for p, nF in nFeatures.items():
                feats = np.array([])
                for f in nF:
                    feats = np.concatenate((feats,
                                            np.arange(f, nUnits * 7, 7)))
                feats = feats.astype(int)
                mod = lm.LogisticRegression(class_weight='balanced',
                                            C=1 / np.sqrt(len(feats)))
                X_train = allZoneFR.loc[TrSets['inco'], feats].values
                X_test = allZoneFR.loc[TrSets['co'], feats].values

                cnt = 0
                r = parallel(delayed(IncoTrial_Decoder)(train, test)
                             for train, test in rskf.split(X_train, y_train))
                res = pd.DataFrame()
                for jj in r:
                    res = pd.concat((jj, res))
                res['Loc'] = p
                res['-log(P)'] = -np.log(res['P'])
                popInCoModsDec = pd.concat((popInCoModsDec, res))
                print(end='.')
            print('\nDecoding Incorrect Model Completed. Time = {0:.2f}s \n'
                  .format(time.time() - t0))
            popInCoModsDec['Decoder'] = 'Incorrect'
        except:
            print('Incorrect Model Failed.')
            print("Error", sys.exc_info()[0], sys.exc_info()[1],
                  sys.exc_info()[2].tb_lineno)

    # group results
    singCellDec = pd.concat((coModsDec, baModsDec, InCoModsDec))
    popDec = pd.concat((popCoModsDec, popBaModsDec, popInCoModsDec))

    singCellDecSummary = singCellDec.groupby(
        ['Loc', 'Test', 'unit', 'Decoder']).mean()
    singCellDecSummary = singCellDecSummary.reset_index()
    singCellDecSummary['Test'] = pd.Categorical(
        singCellDecSummary['Test'], categories=['Model', 'Cue', 'Desc'],
        ordered=True)
    singCellDecSummary.sort_values('Test', inplace=True)
    singCellDecSummary['Loc'] = pd.Categorical(
        singCellDecSummary['Loc'], categories=nFeatures.keys(), ordered=True)
    singCellDecSummary.sort_values('Loc', inplace=True)

    return singCellDec, singCellDecSummary, popDec
# Create a ColumnTransformer for the StandardScaler
scaler = ColumnTransformer([('scaler_media', scaler_media, slice(0, 8)),
                            ('scaler_moda', scaler_moda,
                             slice(8, len(X.columns)))])

# Build the Pipeline combining the ColumnTransformer and the classifier
pipeline = Pipeline([('imputer', imputer), ('scaler', scaler),
                     ('svm', SVC(random_state=RANDOM_STATE,
                                 class_weight=CLASS_WEIGHT,
                                 probability=True))])

# Inner CV (2-fold, 5-times repeated stratified GridSearchCV to pick the best
# parameters)
rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5,
                               random_state=RANDOM_STATE)  # inner
grid_search = GridSearchCV(estimator=pipeline, param_grid=PARAM_GRID,
                           scoring=SCORING, cv=rskf)

# # Outer CV (stratified 5-fold cross-validation to estimate accuracy)
# scores = cross_validate(estimator=grid_search, X=X, y=y, cv=5,
#                         error_score='raise', return_estimator=True,
#                         scoring=SCORING)  # outer
# print('Scores: {}'.format(scores['test_score']))
# print('Mean score: {}'.format(np.mean(scores['test_score'])))

# # Build a 'dummy' classifier and also score it with cross-validation (CV=5)
# # to put the results in a more realistic perspective
# dummy_clf = DummyClassifier(strategy='most_frequent',
#                             random_state=RANDOM_STATE)
# dummy_scores = cross_validate(estimator=dummy_clf, X=X, y=y, cv=5,
#                               error_score='raise', return_estimator=True,
#                               scoring=SCORING)
# print('Dummy scores: {}'.format(dummy_scores['test_score']))
def launch(self) -> int:
    """Execute the :class:`Resampling <resampling.resampling.Resampling>` object."""

    # check input/output paths and parameters
    self.check_data_params(self.out_log, self.err_log)

    # Setup Biobb
    if self.check_restart():
        return 0
    self.stage_files()

    # check mandatory properties
    method, over, under = getCombinedMethod(self.method, self.out_log,
                                            self.__class__.__name__)
    checkResamplingType(self.type, self.out_log, self.__class__.__name__)
    sampling_strategy_over = getSamplingStrategy(
        self.sampling_strategy_over, self.out_log, self.__class__.__name__)
    sampling_strategy_under = getSamplingStrategy(
        self.sampling_strategy_under, self.out_log, self.__class__.__name__)

    # load dataset
    fu.log('Getting dataset from %s'
           % self.io_dict["in"]["input_dataset_path"],
           self.out_log, self.global_log)
    if 'column' in self.target:
        labels = getHeader(self.io_dict["in"]["input_dataset_path"])
        skiprows = 1
        header = 0
    else:
        labels = None
        skiprows = None
        header = None
    data = pd.read_csv(self.io_dict["in"]["input_dataset_path"], header=None,
                       sep="\s+|;|:|,|\t", engine="python",
                       skiprows=skiprows, names=labels)

    train_df = data
    ranges = None

    le = preprocessing.LabelEncoder()

    cols_encoded = []
    for column in train_df:
        # if object dtype, use LabelEncoder.fit_transform
        if train_df[column].dtypes == 'object':
            cols_encoded.append(column)
            train_df[column] = le.fit_transform(train_df[column])

    # defining X
    X = train_df.loc[:, train_df.columns != getTargetValue(
        self.target, self.out_log, self.__class__.__name__)]

    # calling resample method
    if self.method == 'smotetomek':
        method = method(smote=over(sampling_strategy=sampling_strategy_over),
                        tomek=under(sampling_strategy=sampling_strategy_under),
                        random_state=self.random_state_method)
    elif self.method == 'smotenn':
        method = method(smote=over(sampling_strategy=sampling_strategy_over),
                        enn=under(sampling_strategy=sampling_strategy_under),
                        random_state=self.random_state_method)

    fu.log('Target: %s' % (getTargetValue(self.target, self.out_log,
                                          self.__class__.__name__)),
           self.out_log, self.global_log)

    # resampling
    if self.type == 'regression':
        fu.log('Resampling regression dataset, continuous data will be classified',
               self.out_log, self.global_log)
        # call resampler class for regression resampling
        rs = resampler()
        # create n_bins classes for the dataset
        ranges, y, target_pos = rs.fit(
            train_df,
            target=getTargetValue(self.target, self.out_log,
                                  self.__class__.__name__),
            bins=self.n_bins,
            balanced_binning=self.balanced_binning,
            verbose=0)
        # get the resampled data
        final_X, final_y = rs.resample(method, train_df, y)
    elif self.type == 'classification':
        # get X and y
        y = getTarget(self.target, train_df, self.out_log,
                      self.__class__.__name__)
        # fit and resample
        final_X, final_y = method.fit_resample(X, y)
        target_pos = None

    # evaluate resampling
    if self.evaluate:
        fu.log('Evaluating data before resampling with RandomForestClassifier',
               self.out_log, self.global_log)
        cv = RepeatedStratifiedKFold(n_splits=self.evaluate_splits,
                                     n_repeats=self.evaluate_repeats,
                                     random_state=self.random_state_evaluate)
        # evaluate model
        scores = cross_val_score(
            RandomForestClassifier(class_weight='balanced'),
            X, y, scoring='accuracy', cv=cv, n_jobs=-1)
        if not np.isnan(np.mean(scores)):
            fu.log('Mean Accuracy before resampling: %.3f'
                   % (np.mean(scores)), self.out_log, self.global_log)
        else:
            fu.log('Unable to calculate cross validation score, NaN was returned.',
                   self.out_log, self.global_log)

    # log distribution before resampling
    dist = ''
    for k, v in Counter(y).items():
        per = v / len(y) * 100
        rng = ''
        if ranges:
            rng = str(ranges[k])
        dist = dist + 'Class=%d, n=%d (%.3f%%) %s\n' % (k, v, per, rng)
    fu.log('Classes distribution before resampling:\n\n%s' % dist,
           self.out_log, self.global_log)

    # join final_X and final_y in the output dataframe
    if header is None:
        # numpy
        out_df = np.column_stack((final_X, final_y))
    else:
        # pandas
        out_df = final_X.join(final_y)

    # if no header, convert np to pd
    if header is None:
        out_df = pd.DataFrame(data=out_df)

    # if cols encoded, decode them
    if cols_encoded:
        for column in cols_encoded:
            if header is None:
                out_df = out_df.astype({column: int})
            out_df[column] = le.inverse_transform(
                out_df[column].values.ravel())

    # if no header, target is in a different column
    if target_pos:
        t = target_pos
    else:
        t = getTargetValue(self.target, self.out_log, self.__class__.__name__)

    # log distribution after resampling
    if self.type == 'regression':
        ranges, y_out, _ = rs.fit(out_df, target=t, bins=self.n_bins,
                                  balanced_binning=self.balanced_binning,
                                  verbose=0)
    elif self.type == 'classification':
        y_out = getTarget(self.target, out_df, self.out_log,
                          self.__class__.__name__)

    dist = ''
    for k, v in Counter(y_out).items():
        per = v / len(y_out) * 100
        rng = ''
        if ranges:
            rng = str(ranges[k])
        dist = dist + 'Class=%d, n=%d (%.3f%%) %s\n' % (k, v, per, rng)
    fu.log('Classes distribution after resampling:\n\n%s' % dist,
           self.out_log, self.global_log)

    # evaluate resampling
    if self.evaluate:
        fu.log('Evaluating data after resampling with RandomForestClassifier',
               self.out_log, self.global_log)
        cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=42)
        # evaluate model
        scores = cross_val_score(
            RandomForestClassifier(class_weight='balanced'),
            final_X, y_out, scoring='accuracy', cv=cv, n_jobs=-1)
        if not np.isnan(np.mean(scores)):
            fu.log('Mean Accuracy after resampling a %s dataset with %s method: %.3f'
                   % (self.type, resampling_methods[self.method]['method'],
                      np.mean(scores)),
                   self.out_log, self.global_log)
        else:
            fu.log('Unable to calculate cross validation score, NaN was returned.',
                   self.out_log, self.global_log)

    # save output
    hdr = False
    if header == 0:
        hdr = True
    fu.log('Saving resampled dataset to %s'
           % self.io_dict["out"]["output_dataset_path"],
           self.out_log, self.global_log)
    out_df.to_csv(self.io_dict["out"]["output_dataset_path"], index=False,
                  header=hdr)

    return 0
# evaluate knn with uncalibrated probabilities for imbalanced classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

# generate dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.99], flip_y=0,
                           random_state=4)
# define model
model = KNeighborsClassifier()
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))
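# For contrast with the uncalibrated run above, a minimal sketch (not part of
# the original snippet) wrapping the same model in CalibratedClassifierCV
# (sigmoid/Platt scaling here; isotonic is the common alternative). Data, CV
# scheme, and metric are unchanged.
from sklearn.calibration import CalibratedClassifierCV

calibrated = CalibratedClassifierCV(KNeighborsClassifier(), method='sigmoid',
                                    cv=3)
cal_scores = cross_val_score(calibrated, X, y, scoring='roc_auc', cv=cv,
                             n_jobs=-1)
print('Mean ROC AUC (calibrated): %.3f' % mean(cal_scores))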
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=test_split,
                                                    stratify=y)
x_train = x_train[:int(x_train.shape[0] * training_fraction)]
y_train = y_train[:int(y_train.shape[0] * training_fraction)]

# Train model
print('Training model...')
# history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
#                     validation_split=val_split)

train_accuracies = []
train_losses = []
val_accuracies = []
val_losses = []
rskf = RepeatedStratifiedKFold(n_splits=kfolds_splits,
                               n_repeats=kfolds_repeats)
# NOTE: the same `model` instance keeps its weights across folds, so each
# fold continues training from the previous one rather than starting fresh.
for train_index, test_index in rskf.split(x_train, y_train):
    history = model.fit(x_train[train_index], y_train[train_index],
                        batch_size=batch_size, epochs=epochs,
                        validation_data=(x_train[test_index],
                                         y_train[test_index]))
    train_accuracies.append(history.history['acc'])
    train_losses.append(history.history['loss'])
    val_accuracies.append(history.history['val_acc'])
    val_losses.append(history.history['val_loss'])

train_accuracies = np.array(train_accuracies)
train_losses = np.array(train_losses)
val_accuracies = np.array(val_accuracies)
def test_get_n_splits_for_repeated_stratified_kfold():
    n_splits = 3
    n_repeats = 4
    rskf = RepeatedStratifiedKFold(n_splits, n_repeats)
    expected_n_splits = n_splits * n_repeats
    assert_equal(expected_n_splits, rskf.get_n_splits())
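# A quick standalone check of the same property (assumes scikit-learn):
# get_n_splits reports n_splits * n_repeats regardless of the data passed in.
from sklearn.model_selection import RepeatedStratifiedKFold

rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=4)
assert rskf.get_n_splits() == 12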
def evaluate_model(X, y, model):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    # evaluate model
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return scores
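# A usage sketch for evaluate_model above (toy data assumed, not from the
# source): summarize the 5 x 3 = 15 per-fold scores with a mean and standard
# deviation.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=500, random_state=1)
scores = evaluate_model(X_demo, y_demo, LogisticRegression(max_iter=1000))
print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))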
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1337)

# lgb =========================================================================
import lightgbm as lgb

clf_lgb = lgb.LGBMClassifier(objective='binary', boosting_type='dart',
                             verbose=-1, random_state=1337)

scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1337)

acc = cross_val_score(estimator=clf_lgb, X=X_train, y=y_train, cv=rskf,
                      scoring='roc_auc')
acc.mean(), acc.std()

# GridSearchCV needs a predefined plan of the experiments
param_grid = {
    'learning_rate': [0.1],
    'max_depth': [5, 7, 9, -1],
    'min_data_in_leaf': [5, 10, 15],
    'num_leaves': [10, 20, 30],
    'bagging_freq': [7],
print(">> loading dataset ... ") path=Path("data/") train=pd.read_csv(path/"train.csv") #train = train[:100] train_ID_code = train["ID_code"].tolist() train=train.drop("ID_code",axis=1) test=pd.read_csv(path/"test.csv") test_ID_code = test["ID_code"].tolist() test=test.drop("ID_code",axis=1) ## valid_df = pd.DataFrame({"ID_code": train_ID_code , 'target':-1}) result=np.zeros(test.shape[0]) # rskf = RepeatedStratifiedKFold(n_splits=4, n_repeats=1,random_state=SEED) for counter,(train_index, valid_index) in enumerate(rskf.split(train, train.target),1): K.clear_session() model = None # Clearing the NN. model , model_name = create_model(init_dim=200,n0=200,n1=100,n2=50,act='relu') print ("fold:",counter, " -- model name:",model_name) sys.stdout.flush() #Train data t=train.iloc[train_index] v = train.iloc[valid_index] early_stopping = EarlyStopping(monitor='val_auc_roc', patience=2 , mode='max') model_path = model_name + '.h5' model_checkpoint = ModelCheckpoint(model_path, monitor='val_auc_roc' , mode='max', save_best_only=True, verbose=1) results = model.fit(t.drop("target",axis=1), t.target, validation_data=(v.drop("target",axis=1),v.target),
def __init__(self, n_splits=10, n_repeats=2, groupcount=10, random_state=0,
             strategy='quantile'):
    self.groupcount = groupcount
    self.strategy = strategy
    self.cvkwargs = dict(n_splits=n_splits, n_repeats=n_repeats,
                         random_state=random_state)
    self.cv = RepeatedStratifiedKFold(**self.cvkwargs)
    self.discretizer = KBinsDiscretizer(n_bins=self.groupcount,
                                        encode='ordinal',
                                        strategy=self.strategy)
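# The design choice behind the __init__ above: RepeatedStratifiedKFold needs
# discrete labels, so a continuous target is first binned with KBinsDiscretizer
# and the bin ids are used only for stratification. A self-contained sketch of
# that combination (toy data; the class instance itself is not required):
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer

rng = np.random.RandomState(0)
X_demo = rng.rand(100, 4)
y_cont = rng.rand(100)  # continuous target

binner = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
y_bins = binner.fit_transform(y_cont.reshape(-1, 1)).ravel()

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=0)
for train_idx, test_idx in cv.split(X_demo, y_bins):
    pass  # each fold preserves the quantile-bin distribution of the target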
def logi(request):
    hospital.objects.all().delete()
    inp = request.FILES['testinput'].name
    print(inp)

    train_df = pd.read_csv("junapp/static/junapp/data/train.csv")
    # Read CSV test data file into DataFrame
    test_df = pd.read_csv("junapp/static/junapp/data/" + inp)
    print('The number of samples into the train data is {}.'.format(
        train_df.shape[0]))

    a = train_df.isnull().sum()
    train_data = train_df.copy()
    train_data["How many days, immunization service is provided?"].fillna(
        train_df["How many days, immunization service is provided?"].median(
            skipna=True), inplace=True)
    train_data["How many bed are available in this hospital?"].fillna(
        train_df["How many bed are available in this hospital?"].median(
            skipna=True), inplace=True)
    train_data.isnull().sum()

    test_data = test_df.copy()
    test_data["How many days, immunization service is provided?"].fillna(
        test_df["How many days, immunization service is provided?"].median(
            skipna=True), inplace=True)
    a = test_data.isnull().sum()
    print(a)

    cols = [
        "Does this health facility have its own building?",
        "Infrastructure Needs Repairing",
        "Number of rooms available in the health facilities? Number",
        "How many bed are available in this hospital?",
        "OPD service avaliable?", "Immunization Service Avaliable",
        "How many days, immunization service is provided?",
        "Laboraotry Service Avaliable",
        "ASRH (Adolescent Friendly Services) Service Avaliable",
        "Mental health Service Avaliable",
        "Substance abuse Service Avaliable",
        "Oral Health Service Avaliable"
    ]
    X = train_data[cols]
    y = train_data['Passed Threshold']

    # Build a logreg and compute the feature importances
    model = LogisticRegression()
    # create the RFE model and select 13 attributes
    rfe = RFE(model, 13)
    rfe = rfe.fit(X, y)
    # summarize the selection of the attributes
    print('Selected features: %s' % list(X.columns[rfe.support_]))

    # -------------------------
    rfecv = RFECV(estimator=LogisticRegression(), step=1, cv=10,
                  scoring='accuracy')
    rfecv.fit(X, y)
    print("Optimal number of features: %d" % rfecv.n_features_)
    print('Selected features: %s' % list(X.columns[rfecv.support_]))

    Selected_features = [
        "Does this health facility have its own building?",
        "Infrastructure Needs Repairing",
        "Number of rooms available in the health facilities? Number",
        "How many bed are available in this hospital?",
        "OPD service avaliable?", "Immunization Service Avaliable",
        "How many days, immunization service is provided?",
        "Laboraotry Service Avaliable",
        "ASRH (Adolescent Friendly Services) Service Avaliable",
        "Mental health Service Avaliable",
        "Substance abuse Service Avaliable",
        "Oral Health Service Avaliable"
    ]
    X = train_data[Selected_features]

    C = np.arange(1e-05, 5.5, 0.1)
    scoring = {
        'Accuracy': 'accuracy',
        'AUC': 'roc_auc',
        'Log_loss': 'neg_log_loss'
    }
    log_reg = LogisticRegression()

    # Simple pre-processing estimators
    ###########################################################################
    std_scale = StandardScaler(with_mean=False, with_std=False)
    # std_scale = StandardScaler()

    # Defining the CV method: using the repeated stratified k-fold
    ###########################################################################
    n_folds = 5
    n_repeats = 5
    rskfold = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_repeats,
                                      random_state=2)

    # Creating a simple pipeline and defining the grid search
    ###########################################################################
    log_clf_pipe = Pipeline(steps=[('scale', std_scale), ('clf', log_reg)])
    log_clf = GridSearchCV(estimator=log_clf_pipe, cv=rskfold,
                           scoring=scoring, return_train_score=True,
                           param_grid=dict(clf__C=C), refit='Accuracy')
    log_clf.fit(X, y)
    results = log_clf.cv_results_

    # print('=' * 20)
    print("best params: " + str(log_clf.best_estimator_))
    print("best params: " + str(log_clf.best_params_))
    print('best score:', (log_clf.best_score_) * 100)
    # print('=' * 20)

    test_data['Passed Threshold'] = log_clf.predict(
        test_data[Selected_features])
    test_data['Ward No'] = test_df['Ward No']
    test_data['Address'] = test_df['Address']
    test_data['Does this health facility have its own building?'] = test_df[
        'Does this health facility have its own building?']
    test_data['Infrastructure Needs Repairing'] = test_df[
        'Infrastructure Needs Repairing']
    test_data['Number of rooms available in the health facilities? Number'] = \
        test_df['Number of rooms available in the health facilities? Number']
    test_data['How many bed are available in this hospital?'] = test_df[
        'How many bed are available in this hospital?']
    test_data['OPD service avaliable?'] = test_df['OPD service avaliable?']
    test_data['Immunization Service Avaliable'] = test_df[
        'Immunization Service Avaliable']
    test_data['Oral Health Service Avaliable'] = test_df[
        'Oral Health Service Avaliable']
    test_data['Type of Health facility'] = test_df['Type of Health facility']
    test_data['longt'] = test_df['longt']
    test_data['lat'] = test_df['lat']

    submission = test_data[[
        'Ward No', 'Address', 'Type of Health facility', 'Passed Threshold',
        'longt', 'lat', 'Does this health facility have its own building?',
        'Infrastructure Needs Repairing',
        'Number of rooms available in the health facilities? Number',
        'How many bed are available in this hospital?',
        'OPD service avaliable?', 'Immunization Service Avaliable',
        'Oral Health Service Avaliable', 'Type of Health facility'
    ]]
    submission.to_csv("submission.csv", index=False)
    submission.tail()

    # dict = {}
    result = pd.read_csv("submission.csv")
    # dict = {
    #     'ward': result['Ward No'],
    #     'address': result['Address'],
    #     'type': result['Type of Health facility'],
    #     'pass': result['Passed Threshold'],
    #     'longt': result['longt'],
    #     'lat': result['lat']
    # }
    # out = hospital.objects.create(ward_no=dict['ward'],
    #                               address=dict['address'],
    #                               type=dict['type'], passed=dict['pass'])
    # out.save()
    print('Here')

    a = len(result['Ward No'])
    for i in range(a):
        hospitals = hospital.objects.create(
            ward_no=result['Ward No'][i],
            address=result['Address'][i],
            type=result['Type of Health facility'][i],
            passed=result['Passed Threshold'][i],
            lat=result['lat'][i],
            log=result['longt'][i],
            building=result[
                'Does this health facility have its own building?'][i],
            repair=result['Infrastructure Needs Repairing'][i],
            noofrooms=result[
                'Number of rooms available in the health facilities? Number'][i],
            beds=result['How many bed are available in this hospital?'][i],
            optservice=result['OPD service avaliable?'][i],
            immunizationservice=result['Immunization Service Avaliable'][i],
            oralhealth=result['Oral Health Service Avaliable'][i]
            # lat=str(round(result['lat'][i], 4)),
            # log=str(round(result['long'][i], 4))
        )
        hospitals.save()
        # print(result['Ward No'][i])

    # gethospitals = hospital.objects.all()
    # finaldata = []
    # for i in gethospitals:
    #     finaldata.append(
    #         {'address': i.address, 'building': i.building,
    #          'repair': i.repair, 'beds': i.beds, 'room': i.noofrooms})
    # return render(request, 'junapp/logistic.html', {'result': finaldata})
    # return render(request, 'junapp/logistic.html')

    passed = hospital.objects.filter(passed=1).count()
    failed = hospital.objects.filter(passed=0).count()
    total = passed + failed

    gethospitals = hospital.objects.all()
    finaldata = []
    for i in gethospitals:
        finaldata.append({
            'address': i.address,
            'building': i.building,
            'immunizationservice': i.immunizationservice,
            'beds': i.beds,
            'optservice': i.optservice,
            'oralhealth': i.oralhealth,
            'type': i.type,
            'repair': i.repair,
            'pass': i.passed,
            'room': i.noofrooms
        })
    finalresult = {'finaldata': finaldata, 'pass': passed, 'fail': failed}
    return render(request, 'junapp/logistic.html', {'result': finalresult})
def calculate_reliability(self, input, output, models_dict, file_path):
    """
    Calculates reliability estimations for selected models based on test data
    within a single CV procedure. The calculated estimations are compared
    with the prediction accuracy using Spearman's rank correlation test.
    :param input: array-like, shape (n_samples, n_features)
        Training data.
    :param output: array-like, shape (n_samples, )
        True labels.
    :param models_dict: dictionary
        Selected classifiers with details for grid search and feature
        selection.
    :param file_path: String
        File path for saving data.
    :return: DataFrame
        Calculated correlation coefficients.
    """
    from ReliabilityEstimation import ReliabilityEstimation

    X, y = input, output
    df_res = pd.DataFrame(columns=[
        'Classifier', 'Corr_Oref', 'p_Oref', 'Corr_DENS', 'p_DENS',
        'Corr_CNK', 'p_CNK', 'Corr_LCV', 'p_LCV'
    ])

    for m in models_dict:
        rel_ref = []
        rel_dens = []
        rel_cnk = []
        rel_lcv = []
        acc_probabilities = []
        predicted_label = []
        true_label = []
        clf = m['classifier']
        fs = m['fs_method']
        df_raw_res = pd.DataFrame(columns=[
            'Oref', 'DENS', 'CNK', 'LCV', 'Accuracy', 'Predicted_label',
            'True_label'
        ])

        # single (non-repeated) stratified N_inner-fold cross-validation
        skf = RepeatedStratifiedKFold(n_splits=self.N_inner, n_repeats=1,
                                      random_state=88)
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            fs_smote_clf = Pipeline([('oversampling',
                                      SMOTE(random_state=88, k_neighbors=3)),
                                     ('feature_selection', fs),
                                     ('classifier', clf)])
            param_grid = m['grid']
            classifier = copy.deepcopy(fs_smote_clf)
            if m['name'] == 'MLP_mRMR_50':
                gridsearch_cv = fs_smote_clf
            else:
                gridsearch_cv = GridSearchCV(fs_smote_clf, param_grid, cv=10,
                                             scoring='roc_auc')
            gridsearch_cv.fit(X_train, y_train)

            # predicted class
            y_predict = gridsearch_cv.predict(X_test)
            predicted_label.append(y_predict)
            # predicted probabilities
            probas_ = gridsearch_cv.predict_proba(X_test)
            acc = self.ind_classification_accuracy(probas_, y_test)
            acc_probabilities.append(acc)
            true_label.append(y_test)

            rel = ReliabilityEstimation()
            ref = list(map(lambda prob: rel.o_ref(prob),
                           np.max(probas_, axis=1)))
            dens = list(map(lambda test: rel.DENS(X_train, test), X_test))
            cnk = list(map(
                lambda test: rel.CNK(
                    X_train, y_train, test,
                    gridsearch_cv.predict_proba(test.reshape(1, -1))),
                X_test))
            lcv = list(map(
                lambda test: rel.LCV(X_train, y_train, test, 40, classifier),
                X_test))
            rel_ref.append(ref)
            rel_dens.append(dens)
            rel_cnk.append(cnk)
            rel_lcv.append(lcv)

        merged_rel_ref = np.concatenate(rel_ref).ravel()
        merged_rel_dens = np.concatenate(rel_dens).ravel()
        merged_rel_cnk = np.concatenate(rel_cnk).ravel()
        merged_rel_lcv = np.concatenate(rel_lcv).ravel()
        merged_acc = np.concatenate(acc_probabilities).ravel()
        merged_predicted_labels = np.concatenate(predicted_label).ravel()
        merged_true_labels = np.concatenate(true_label).ravel()

        df_raw_res["Oref"] = merged_rel_ref
        df_raw_res["DENS"] = merged_rel_dens
        df_raw_res["CNK"] = merged_rel_cnk
        df_raw_res["LCV"] = merged_rel_lcv
        df_raw_res["Accuracy"] = merged_acc
        df_raw_res["Predicted_label"] = merged_predicted_labels
        df_raw_res["True_label"] = merged_true_labels
        df_raw_res.to_csv(file_path + 'Reliability_data_' + m['name'] + '.csv',
                          header=True)

        correlation_ref, p_ref = spearmanr(merged_rel_ref, merged_acc)
        correlation_dens, p_dens = spearmanr(merged_rel_dens, merged_acc)
        correlation_cnk, p_cnk = spearmanr(merged_rel_cnk, merged_acc)
        correlation_lcv, p_lcv = spearmanr(merged_rel_lcv, merged_acc)

        df_res = df_res.append(
            {
                'Classifier': m['name'],
                'Corr_Oref': correlation_ref,
                'p_Oref': p_ref,
                'Corr_DENS': correlation_dens,
                'p_DENS': p_dens,
                'Corr_CNK': correlation_cnk,
                'p_CNK': p_cnk,
                'Corr_LCV': correlation_lcv,
                'p_LCV': p_lcv
            },
            ignore_index=True)

    return df_res
def fit_model_kfold(
    features,
    model,
    analysis_type="classification",
    reduce_set=True,
    reduced_set_size=100,
    reduced_set_max_correlation=0.9,
    n_repeats=1,
    random_state=42,
    n_splits=None,
    compute_shap=True,
):
    """Classify graphs from extracted features with k-fold.

    Args:
        features (dataframe): extracted features
        model (str): model to perform the analysis
        analysis_type (str): 'classification' or 'regression'
        reduce_set (bool): if True, the classification will be rerun
            on a reduced set of top features (from SHAP analysis)
        reduced_set_size (int): number of features to keep for the reduced set
        reduced_set_max_correlation (float): threshold used to discard highly
            correlated top features from the reduced set of features
        n_repeats (int): number of k-fold repeats
        random_state (int): rng seed
        n_splits (int): number of splits for k-fold, None=automatic estimation
        compute_shap (bool): compute SHAP values or not

    Returns:
        (dict): dictionary with results
    """
    if model is None:
        raise ValueError("Please provide a model for classification")

    X, y = features_to_Xy(features)

    if n_splits is None:
        n_splits = _number_folds(y)
    L.info("Using %s splits", str(n_splits))

    if analysis_type == "classification":
        folds = RepeatedStratifiedKFold(n_splits=n_splits,
                                        n_repeats=n_repeats,
                                        random_state=random_state)
    elif analysis_type == "regression":
        folds = RepeatedKFold(n_splits=n_splits,
                              n_repeats=n_repeats,
                              random_state=random_state)

    acc_scores, shap_values = _evaluate_kfold(X, y, model, folds,
                                              analysis_type, compute_shap)
    _print_accuracy(acc_scores, analysis_type)
    if compute_shap:
        mean_shap_values, shap_feature_importance = _get_shap_feature_importance(
            shap_values)
    else:
        mean_shap_values = None
        shap_feature_importance = None

    analysis_results = {
        "X": X,
        "y": y,
        "acc_scores": acc_scores,
        "mean_shap_values": mean_shap_values,
        "shap_values": shap_values,
        "shap_feature_importance": shap_feature_importance,
        "reduced_features": None,
    }
    # the reduced-set rerun needs SHAP importances, so both flags must be set
    if not reduce_set or not compute_shap:
        return analysis_results

    reduced_features = _get_reduced_feature_set(
        X,
        shap_feature_importance,
        n_top_features=reduced_set_size,
        alpha=reduced_set_max_correlation,
    )
    reduced_acc_scores, reduced_shap_values = _evaluate_kfold(
        X[reduced_features], y, model, folds, analysis_type, compute_shap)
    _print_accuracy(reduced_acc_scores, analysis_type, reduced=True)
    (
        reduced_mean_shap_values,
        reduced_shap_feature_importance,
    ) = _get_shap_feature_importance(reduced_shap_values)
    analysis_results.update({
        "reduced_features": reduced_features,
        "reduced_shap_values": reduced_shap_values,
        "reduced_acc_scores": reduced_acc_scores,
        "reduced_mean_shap_values": reduced_mean_shap_values,
        "reduced_shap_feature_importance": reduced_shap_feature_importance,
    })
    return analysis_results
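# Minimal sketch of the fold-selection branch above: stratified repeated
# k-fold for classification (preserves class proportions per fold), plain
# repeated k-fold for regression. Illustrative helper, not from the source.
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold

def make_folds(analysis_type, n_splits=5, n_repeats=2, random_state=42):
    if analysis_type == "classification":
        return RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=random_state)
    return RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats,
                         random_state=random_state)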
) param_grid = [ { "regressor__C": np.logspace(-3, 0, 4), "regressor__solver": ["liblinear"], "regressor__penalty": ["l1"] }, { "regressor__C": np.logspace(-3, 0, 4), "regressor__solver": ["lbfgs"], "regressor__penalty": ["l2"] # these are actually just the defaults } ] cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5) grid = GridSearchCV( pipe, param_grid, scoring="recall", cv=cv, return_train_score=True, verbose=10 ) ### Scoring ### def print_model_scores(model, scoring_list = [ accuracy_score,
def main(dataset_name): dataset = load_dataset() raw_data = np.asarray(dataset['raw']['data']) raw_label = np.asarray(dataset['raw']['label']) num_classes = len(np.unique(raw_label)) rskf = RepeatedStratifiedKFold(n_splits=k_folds, n_repeats=k_fold_reps, random_state=42) for fs_method, fs_range in fs_methods: print('FS-Method : ', fs_method.__name__) nfeats = [] accuracies = [] svc_accuracies = [] BAs = [] svc_BAs = [] mAPs = [] svc_mAPs = [] mus = [] name = dataset_name + '_mu_' + str(mu) print(name) for j, (train_index, test_index) in enumerate(rskf.split(raw_data, raw_label)): print('k_fold', j, 'of', k_folds*k_fold_reps) train_data, train_labels = raw_data[train_index].copy(), raw_label[train_index].copy() test_data, test_labels = raw_data[test_index].copy(), raw_label[test_index].copy() train_labels = to_categorical(train_labels, num_classes=num_classes) test_labels = to_categorical(test_labels, num_classes=num_classes) valid_features = np.where(np.abs(train_data).sum(axis=0) > 0)[0] if len(valid_features) < train_data.shape[1]: print('Removing', train_data.shape[1] - len(valid_features), 'zero features') train_data = train_data[:, valid_features] test_data = test_data[:, valid_features] model_kwargs = { # 'nclasses': num_classes, 'mu': mu / len(train_data), 'degree': 3 } print('mu :', model_kwargs['mu'], ', batch_size :', batch_size) svc_kwargs = { 'C': 1.0, 'solver': 0. } print('Starting feature selection') best_fs = 0 best_value = None for fs_value in fs_range: fs_class = fs_method(10, fs_value, matlab_engine=matlab_engine) fs_class.fit(train_data, 2. * train_labels[:, -1] - 1.) svc_train_data = fs_class.transform(train_data) norm = normalization_func() svc_train_data_norm = norm.fit_transform(svc_train_data) for s in [0, 1, 2, 3]: for my_c in [0.001, 0.01, 0.1, 0.5, 1.0, 1.4, 1.5, 1.6, 2.0, 2.5, 5.0, 25.0, 50.0, 100.0]: cmd = '-v 5 -s ' + str(s) + ' -c ' + str(my_c) + ' -q' cv = liblinearutil.train((2 * train_labels[:, -1] - 1).tolist(), svc_train_data_norm.tolist(), cmd) if cv > best_fs: best_fs = cv best_value = fs_value print('best fs_value: ', best_value) fs_class = fs_method(200, best_value, matlab_engine=matlab_engine) fs_class.fit(train_data, 2. * train_labels[:, -1] - 1.) 
print('Finishing feature selection') for i, n_features in enumerate([10, 50, 100, 150, 200]): n_accuracies = [] n_svc_accuracies = [] n_BAs = [] n_svc_BAs = [] n_mAPs = [] n_svc_mAPs = [] n_train_accuracies = [] print('n_features : ', n_features) fs_class.n_features_to_select = n_features svc_train_data = fs_class.transform(train_data) svc_test_data = fs_class.transform(test_data) norm = normalization_func() svc_train_data_norm = norm.fit_transform(svc_train_data) svc_test_data_norm = norm.transform(svc_test_data) bestcv = -1 bestc = None bestSolver = None for s in [0, 1, 2, 3]: for my_c in [0.001, 0.01, 0.1, 0.5, 1.0, 1.4, 1.5, 1.6, 2.0, 2.5, 5.0, 25.0, 50.0, 100.0]: cmd = '-v 5 -s ' + str(s) + ' -c ' + str(my_c) + ' -q' cv = liblinearutil.train((2 * train_labels[:, -1] - 1).tolist(), svc_train_data_norm.tolist(), cmd) if cv > bestcv: bestcv = cv bestc = my_c bestSolver = s svc_kwargs['C'] = bestc svc_kwargs['solver'] = bestSolver print('Best -> C:', bestc, ', s:', bestSolver, ', acc:', bestcv) for r in range(reps): model = train_SVC(svc_train_data_norm, train_labels, svc_kwargs) _, accuracy, test_pred = liblinearutil.predict( (2 * test_labels[:, -1] - 1).tolist(), svc_test_data_norm.tolist(), model, '-q' ) test_pred = np.asarray(test_pred) n_svc_accuracies.append(accuracy[0]) n_svc_BAs.append(balance_accuracy(test_labels, test_pred)) n_svc_mAPs.append(average_precision_score(test_labels[:, -1], test_pred)) del model model = train_Keras(svc_train_data, train_labels, svc_test_data, test_labels, model_kwargs) train_data_norm = model.normalization.transform(svc_train_data) test_data_norm = model.normalization.transform(svc_test_data) test_pred = model.predict(test_data_norm) n_BAs.append(balance_accuracy(test_labels, test_pred)) n_mAPs.append(average_precision_score(test_labels[:, -1], test_pred)) n_accuracies.append(model.evaluate(test_data_norm, test_labels, verbose=0)[-1]) n_train_accuracies.append(model.evaluate(train_data_norm, train_labels, verbose=0)[-1]) del model K.clear_session() print( 'n_features : ', n_features, ', acc : ', n_accuracies[-1], ', BA : ', n_BAs[-1], ', mAP : ', n_mAPs[-1], ', train_acc : ', n_train_accuracies[-1], ', svc_acc : ', n_svc_accuracies[-1], ', svc_BA : ', n_svc_BAs[-1], ', svc_mAP : ', n_svc_mAPs[-1], ) if i >= len(accuracies): accuracies.append(n_accuracies) svc_accuracies.append(n_svc_accuracies) BAs.append(n_BAs) mAPs.append(n_mAPs) svc_BAs.append(n_svc_BAs) svc_mAPs.append(n_svc_mAPs) nfeats.append(n_features) mus.append(model_kwargs['mu']) else: accuracies[i] += n_accuracies svc_accuracies[i] += n_svc_accuracies BAs[i] += n_BAs mAPs[i] += n_mAPs svc_BAs[i] += n_svc_BAs svc_mAPs[i] += n_svc_mAPs output_filename = directory + 'LinearSVC_' + fs_method.__name__ + '.json' if not os.path.isdir(directory): os.makedirs(directory) info_data = { 'reps': reps, 'classification': { 'mus': mus, 'n_features': nfeats, 'accuracy': accuracies, 'mean_accuracy': np.array(accuracies).mean(axis=1).tolist(), 'svc_accuracy': svc_accuracies, 'mean_svc_accuracy': np.array(svc_accuracies).mean(axis=1).tolist(), 'BA': BAs, 'mean_BA': np.array(BAs).mean(axis=1).tolist(), 'mAP': mAPs, 'mean_mAP': np.array(mAPs).mean(axis=1).tolist(), 'svc_BA': svc_BAs, 'svc_mean_BA': np.array(svc_BAs).mean(axis=1).tolist(), 'svc_mAP': svc_mAPs, 'svc_mean_mAP': np.array(svc_mAPs).mean(axis=1).tolist(), } } for k, v in info_data['classification'].items(): if 'mean' in k: print(k, v) with open(output_filename, 'w') as outfile: json.dump(info_data, outfile)
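# Aside: the C/solver search above uses LIBLINEAR's built-in cross validation.
# With '-v 5' in the options string, liblinearutil.train returns the 5-fold CV
# accuracy (a float) instead of a model. A condensed, hedged sketch of that
# pattern; y_pm1 (a list of +/-1 labels) and X_norm (a list-of-lists feature
# matrix) are placeholders, not names from the code above.
best_acc, best_c, best_solver = -1, None, None
for s in [0, 1, 2, 3]:
    for my_c in [0.001, 0.1, 1.0, 10.0, 100.0]:
        acc = liblinearutil.train(y_pm1, X_norm,
                                  '-v 5 -s %d -c %g -q' % (s, my_c))
        if acc > best_acc:
            best_acc, best_c, best_solver = acc, my_c, s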
sbsMinIdx = np.random.choice(minIdx, int(majSize * imbRatio), replace=False) minDel = np.setdiff1d(minIdx, sbsMinIdx) if len(minDel) > 0: imbY = np.delete(imbY, minDel) imbX = np.delete(imbX, minDel, axis=0) print("to size: " + str(np.sum(imbY == c))) techNames = [] imbSizesTxt = [] for techType, a in [['none', 0], ['WeightedBase', 0], ['SMOTE', 0], ['mixup', 0.1], ['remix', 0.1]]: techNames = np.append(techNames, techType + str(a)) imbSizesTxt = np.append(imbSizesTxt, str(imbRatio)) rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=10, random_state=36851234) tmpGm = np.array([]) tmpFm = np.array([]) tmpBa = np.array([]) tmpBp = np.array([]) tmpBmc = np.array([]) tmpBb = np.array([]) for train_index, test_index in rskf.split(imbX, imbY): X_train, X_test = imbX[train_index, :], imbX[test_index, :] y_train, y_test = imbY[train_index], imbY[test_index] scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) X_train = np.clip(X_train, -5, 5) X_test = np.clip(X_test, -5, 5)
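# Note on the preprocessing inside the CV loop above: StandardScaler is fit on
# the training fold only and then applied to the test fold, so no test-set
# statistics leak into training; clipping to [-5, 5] bounds extreme values.
# A minimal stand-alone sketch of the same pattern:
import numpy as np
from sklearn.preprocessing import StandardScaler

def scale_and_clip(X_train, X_test, lo=-5, hi=5):
    scaler = StandardScaler().fit(X_train)  # statistics from the train fold only
    return (np.clip(scaler.transform(X_train), lo, hi),
            np.clip(scaler.transform(X_test), lo, hi))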
def runBS(input, output, classifier, oDim, lenTrainer):
    inputClass = input[:lenTrainer, :]
    inputValid = input[lenTrainer:, :]
    outputClass = output[:lenTrainer]
    outputValid = output[lenTrainer:]  # kept for clarity; not used below

    def build_classifier():
        # fresh estimator per fit, configured from the classifier dict
        if classifier['type'] == 'svm':
            return svm.SVC(kernel=classifier['kernel'],
                           degree=classifier['degree'],
                           probability=True,
                           random_state=np.random.RandomState(0))
        elif classifier['type'] == 'lda':
            return LDA(solver="svd", store_covariance=True)
        elif classifier['type'] == 'knn':
            return KNeighborsClassifier(5)

    # accumulate class probabilities for every sample over many repeated splits
    probs = np.zeros((len(outputClass), oDim))
    nt = np.zeros(len(outputClass))
    rskf = RepeatedStratifiedKFold(n_splits=2,
                                   n_repeats=np.power(2, 10),
                                   random_state=np.random.RandomState(0))
    for train, test in rskf.split(inputClass, outputClass):
        clf = build_classifier()
        probas_ = clf.fit(inputClass[train],
                          outputClass[train]).predict_proba(inputClass[test])
        for ip, it in enumerate(test):
            probs[it, :] += probas_[ip, :]
            nt[it] += 1.0

    # Validation part: refit on the full training block, score the held-out tail
    clf_v = build_classifier()
    probas_v = clf_v.fit(inputClass, outputClass).predict_proba(inputValid)
    prob_v = np.max(probas_v, axis=1)
    pred_v = np.argmax(probas_v, axis=1)

    # Average the accumulated probabilities and derive labels and confidences;
    # argmax reproduces the original explicit 0/1/2 mapping for three classes
    score_t = probs / nt[:, None]
    prob_t = np.max(score_t, axis=1)
    pred_t = np.argmax(score_t, axis=1)
    return pred_t, prob_t, pred_v, prob_v
# Define model and parameters
model = RandomForestClassifier(random_state=0,
                               n_jobs=-1,
                               class_weight='balanced_subsample')
n_estimators = [400, 600, 700, 1000]
max_features = [2, 3]
max_depth = [4, 5]
# Define grid search
grid = dict(n_estimators=n_estimators,
            max_features=max_features,
            max_depth=max_depth)
# 3 - Define how the grid will be searched: stratified to preserve the two
# classes, splitting into 5 folds and repeating 6 random times
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=6, random_state=0)
# Create model with grid and CV structure
grid_search = GridSearchCV(estimator=model,
                           param_grid=grid,
                           n_jobs=-1,
                           cv=cv,
                           error_score=0,
                           verbose=2)
# 4 - Find hyperparameters
grid_result = grid_search.fit(X_train, y_train.values.ravel())
# Summarize results
print("!! Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
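# GridSearchCV also records per-candidate statistics in cv_results_; a short,
# generic follow-up for inspecting every combination, not just best_params_:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_result.cv_results_['params']):
    print("%.4f (+/-%.4f) with %r" % (mean, std, params))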
def main():
    # if dataset is not provided on call, terminate
    if len(sys.argv) < 3:
        print("usage: python classifier_metrics.py <train_data_file> <test_data_file>")
        sys.exit()

    # pass dataset and get the matrix containing the data vectors and data targets
    ret_value = data_preprocessing(sys.argv[1])
    data_matrix = ret_value[0]
    category_labels = ret_value[1]

    # create stratified k-fold iterators to calculate metrics
    k_fold = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
    sk_fold = StratifiedKFold(n_splits=10)
    metrics = [
        'accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'
    ]

    # create RandomForest classifier and calculate metrics
    rf_clf = RandomForestClassifier(n_jobs=-1)
    rf_result = cross_validate(rf_clf,
                               data_matrix,
                               category_labels,
                               cv=k_fold,
                               scoring=metrics,
                               return_train_score=False,
                               n_jobs=-1)
    print("RANDOM FOREST:")
    for key, value in rf_result.items():
        print(key + " : " + str(np.round(np.mean(value), decimals=5)))
    print("\n")

    # create MNB classifier and calculate metrics.
    # Scale the data matrix to positive values because MNB does not accept
    # negative values; increasing the scaling range increases accuracy until
    # the scale is around 10.
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 10), copy=True)
    scaled_data_matrix = scaler.fit_transform(data_matrix)
    mnb_clf = MultinomialNB()
    mnb_result = cross_validate(mnb_clf,
                                scaled_data_matrix,
                                category_labels,
                                cv=k_fold,
                                scoring=metrics,
                                return_train_score=False,
                                n_jobs=-1)
    print("MULTINOMIAL NAIVE BAYES:")
    for key, value in mnb_result.items():
        print(key + " : " + str(np.round(np.mean(value), decimals=5)))
    print("\n")

    # load hyperparameters for the SVC classifier from file hyperparameter_values.py
    kernel_hp = HYPERPARAMETER_VALUES['kernel']
    c_hp = HYPERPARAMETER_VALUES['C']
    gamma_hp = 'auto'
    if kernel_hp == 'rbf':
        gamma_hp = HYPERPARAMETER_VALUES['gamma']

    # create SVC classifier and calculate metrics
    svc_clf = SVC(kernel=kernel_hp, C=c_hp, gamma=gamma_hp)
    svc_result = cross_validate(svc_clf,
                                data_matrix,
                                category_labels,
                                cv=k_fold,
                                scoring=metrics,
                                return_train_score=False,
                                n_jobs=-1)
    print("svm.SVC (kernel=" + kernel_hp + ", C=" + str(c_hp) +
          ", gamma=" + str(gamma_hp) + ")")
    for key, value in svc_result.items():
        print(key + " : " + str(np.round(np.mean(value), decimals=5)))
    print("\n")

    # create KNN (my implementation) classifier and calculate metrics
    knn_clf = MyKNN(k=10)
    knn_result = cross_validate(knn_clf,
                                data_matrix,
                                category_labels,
                                cv=sk_fold,
                                scoring=metrics,
                                return_train_score=False)
    print("My implementation of KNN (brute force):")
    for key, value in knn_result.items():
        print(key + " : " + str(np.round(np.mean(value), decimals=5)))
    print("\n")

    # Beat the benchmark
    TITLE_WEIGHT = 5
    # preprocess the data differently to achieve a better score
    btb_ret_value = btb_data_preprocessing(sys.argv[1],
                                           title_weight=TITLE_WEIGHT,
                                           n_comp=250,
                                           ret_vectorizers=True)
    btb_data_matrix = btb_ret_value[0]
    btb_category_labels = btb_ret_value[1]

    # SVC was the better-scoring classifier, so use it; calculate metrics
    btb_clf = SVC(kernel=kernel_hp,
                  C=c_hp,
                  gamma=gamma_hp,
                  class_weight='balanced',
                  probability=False)
    btb_result = cross_validate(btb_clf,
                                btb_data_matrix,
                                btb_category_labels,
                                cv=k_fold,
                                scoring=metrics,
                                return_train_score=False,
                                n_jobs=-1)
    print("(Beat the benchmark) svm.SVC (kernel=" + kernel_hp + ", C=" +
          str(c_hp) + ", gamma=" + str(gamma_hp) + ")")
    for key, value in btb_result.items():
        print(key + " : " + str(np.round(np.mean(value), decimals=5)))
    print("\n")

    # train the classifier with the train data; cross_validate() does not
    # train the classifier object passed to it but a copy of it
    btb_clf.fit(btb_data_matrix, btb_category_labels)  # refit

    # get the vectorizers and transformers used to fit and transform the train data
    count_vectorizer = btb_ret_value[2]
    tfidf_transformer = btb_ret_value[3]
    svd = btb_ret_value[4]
    le = btb_ret_value[5]

    # read test data and transform it using the above vectorizers/transformers
    test_data = pd.read_csv(sys.argv[2], sep="\t")
    test_redu_matrix = test_data_transformation(test_data, count_vectorizer,
                                                tfidf_transformer, svd,
                                                TITLE_WEIGHT)

    # do the class predictions for the test data
    test_category_pred = btb_clf.predict(test_redu_matrix)
    # store predictions to file
    create_pred_file(test_data, test_category_pred, le)
    # store metrics to file
    create_eval_file(mnb_result, rf_result, svc_result, knn_result,
                     btb_result, metrics)
def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix,
              model_dir, checkpoint_dir, is_master):
    """Train and save XGBoost model using data on current node.

    If doing distributed training, XGBoost will use rabit to sync the trained
    model between each boosting iteration. The trained model is only saved if
    'is_master' is True.

    :param train_cfg: Training hyperparameter configurations
    :param train_dmatrix: Training Data Matrix
    :param val_dmatrix: Validation Data Matrix
    :param train_val_dmatrix: Training + Validation Data Matrix
    :param model_dir: Directory where model will be saved
    :param checkpoint_dir: Directory where checkpoints will be saved
    :param is_master: True if single node training, or the current node is
        the master node in distributed training.
    """
    # Parse arguments for train() API
    num_round = train_cfg.pop("num_round")

    # Parse arguments for intermediate model callback
    save_model_on_termination = train_cfg.pop('save_model_on_termination', "false")

    # Evaluation metrics to use with train() API
    tuning_objective_metric_param = train_cfg.pop("_tuning_objective_metric", None)
    eval_metric = train_cfg.get("eval_metric")
    cleaned_eval_metric, configured_feval, tuning_objective_metric = \
        train_utils.get_eval_metrics_and_feval(
            tuning_objective_metric_param, eval_metric)
    if cleaned_eval_metric:
        train_cfg['eval_metric'] = cleaned_eval_metric
    else:
        train_cfg.pop('eval_metric', None)

    early_stopping_rounds = train_cfg.pop('early_stopping_rounds', None)
    early_stopping_data_name = 'validation' if val_dmatrix else None
    early_stopping_metric = None
    if early_stopping_rounds:
        if tuning_objective_metric:
            early_stopping_metric = tuning_objective_metric[-1]
        elif eval_metric:
            early_stopping_metric = eval_metric[-1]

    logging.info("Train matrix has {} rows and {} columns".format(
        train_dmatrix.num_row(), train_dmatrix.num_col()))
    if val_dmatrix:
        logging.info("Validation matrix has {} rows".format(
            val_dmatrix.num_row()))

    try:
        kfold = train_cfg.pop("_kfold", None)
        if kfold is None:
            xgb_model, iteration, callbacks, watchlist = get_callbacks_watchlist(
                train_dmatrix=train_dmatrix,
                val_dmatrix=val_dmatrix,
                model_dir=model_dir,
                checkpoint_dir=checkpoint_dir,
                early_stopping_data_name=early_stopping_data_name,
                early_stopping_metric=early_stopping_metric,
                early_stopping_rounds=early_stopping_rounds,
                save_model_on_termination=save_model_on_termination,
                is_master=is_master)

            add_debugging(callbacks=callbacks,
                          hyperparameters=train_cfg,
                          train_dmatrix=train_dmatrix,
                          val_dmatrix=val_dmatrix)

            bst = xgb.train(train_cfg,
                            train_dmatrix,
                            num_boost_round=num_round - iteration,
                            evals=watchlist,
                            feval=configured_feval,
                            callbacks=callbacks,
                            xgb_model=xgb_model,
                            verbose_eval=False)
        else:
            num_cv_round = train_cfg.pop("_num_cv_round", 1)
            logging.info(
                "Run {}-round of {}-fold cross validation with {} rows".format(
                    num_cv_round, kfold, train_val_dmatrix.num_row()))

            bst = []
            evals_results = []

            num_class = train_cfg.get("num_class", None)
            objective = train_cfg.get("objective", None)
            # The splitters only use len(X), so a range over the row indices
            # stands in for an (n_samples, n_features) array here.
            X = range(train_val_dmatrix.num_row())
            # guard against objective being None before calling startswith
            is_classification = num_class or (objective is not None and
                                              objective.startswith("binary:"))
            y = train_val_dmatrix.get_label() if is_classification else None
            if y is not None:
                rkf = RepeatedStratifiedKFold(n_splits=kfold,
                                              n_repeats=num_cv_round)
            else:
                rkf = RepeatedKFold(n_splits=kfold, n_repeats=num_cv_round)
            for train_index, val_index in rkf.split(X=X, y=y):
                cv_train_dmatrix = train_val_dmatrix.slice(train_index)
                cv_val_dmatrix = train_val_dmatrix.slice(val_index)

                xgb_model, iteration, callbacks, watchlist = get_callbacks_watchlist(
                    train_dmatrix=cv_train_dmatrix,
val_dmatrix=cv_val_dmatrix, model_dir=model_dir, checkpoint_dir=checkpoint_dir, early_stopping_data_name=early_stopping_data_name, early_stopping_metric=early_stopping_metric, early_stopping_rounds=early_stopping_rounds, save_model_on_termination=save_model_on_termination, is_master=is_master, fold=len(bst)) add_debugging(callbacks=callbacks, hyperparameters=train_cfg, train_dmatrix=cv_train_dmatrix, val_dmatrix=cv_val_dmatrix) evals_result = {} logging.info( "Train cross validation fold {}".format((len(bst) % kfold) + 1)) booster = xgb.train(train_cfg, cv_train_dmatrix, num_boost_round=num_round - iteration, evals=watchlist, feval=configured_feval, evals_result=evals_result, callbacks=callbacks, xgb_model=xgb_model, verbose_eval=False) bst.append(booster) evals_results.append(evals_result) if len(bst) % kfold == 0: logging.info( "The metrics of round {} cross validation".format( int(len(bst) / kfold))) print_cv_metric(num_round, evals_results[-kfold:]) if num_cv_round > 1: logging.info( "The overall metrics of {}-round cross validation".format( num_cv_round)) print_cv_metric(num_round, evals_results) except Exception as e: for customer_error_message in CUSTOMER_ERRORS: if customer_error_message in str(e): raise exc.UserError(str(e)) exception_prefix = "XGB train call failed with exception" raise exc.AlgorithmError("{}:\n {}".format(exception_prefix, str(e))) if not os.path.exists(model_dir): os.makedirs(model_dir) if is_master: if type(bst) is not list: model_location = os.path.join(model_dir, MODEL_NAME) bst.save_model(model_location) logging.debug("Stored trained model at {}".format(model_location)) else: for fold in range(len(bst)): model_location = os.path.join(model_dir, f"{MODEL_NAME}-{fold}") bst[fold].save_model(model_location) logging.debug("Stored trained model {} at {}".format( fold, model_location))
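# Condensed sketch of the DMatrix k-fold pattern used in train_job above:
# sklearn's splitters only need the number of rows, so a range over the row
# indices can stand in for X, and DMatrix.slice() materializes each fold.
# `data` and `labels` below are synthetic placeholders.
import numpy as np
import xgboost as xgb
from sklearn.model_selection import RepeatedStratifiedKFold

data = np.random.rand(20, 4)
labels = np.array([0, 1] * 10)
dtrain = xgb.DMatrix(data, label=labels)

rkf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, random_state=0)
for train_idx, val_idx in rkf.split(range(dtrain.num_row()), labels):
    fold_train = dtrain.slice(train_idx)  # rows of this training fold
    fold_val = dtrain.slice(val_idx)      # rows held out for validation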
traindataall = datapreparing['traindataall'] testdataall = datapreparing['testdataall'] WHO_train = traindataall['WHO'] WHO_test = testdataall['WHO'] print("Preparing data done.") ALLgroup_bestmodscore = [] ALLgroup_bestmodmeanscore = [] ALLgroupresults = {} for key in select_features.keys(): val = select_features[key] select_features_mod = val select_U_features_mod = list(val.values())[0] select_U_data_mod = list(val.values())[1] select_U_data_test_mod = list(val.values())[2] rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0) k = 0 ALLmodscores = [] ALLmod = {} LRscores = [] SVMscores = [] KNNscores = [] NBscores = [] RFscores = [] Stackscores = [] Lassoscores = [] param_grid_svm = { 'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10, 50],
# Create a ColumnTransformer for the StandardScaler
scaler = ColumnTransformer([
    ('scaler_media', scaler_media, slice(0, 8)),
    ('scaler_moda', scaler_moda, slice(8, len(X.columns)))
])

# Create the Pipeline combining the ColumnTransformer and the classifier
pipeline = Pipeline([
    ('imputer', imputer),
    ('scaler', scaler),
    ('svm', SVC(random_state=random_state, class_weight=class_weight))
])

# Inner CV (2-fold, 5-times repeated stratified GridSearchCV to pick the best parameters)
rskf = RepeatedStratifiedKFold(n_splits=2,
                               n_repeats=5,
                               random_state=random_state)  # inner
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring=SCORING,
                           cv=rskf)

# # Outer CV (5-fold stratified cross-validation to estimate accuracy)
# scores = cross_validate(estimator=grid_search, X=X, y=y, cv=5, error_score='raise', return_estimator=True, scoring=SCORING)  # outer
# print('Scores: {}'.format(scores['test_score']))
# print('Mean score: {}'.format(np.mean(scores['test_score'])))
#
# # Build a 'dummy' classifier and also evaluate it with cross-validation (CV=5) for a more realistic baseline
# dummy_clf = DummyClassifier(strategy='most_frequent', random_state=random_state)
# dummy_scores = cross_validate(estimator=dummy_clf, X=X, y=y, cv=5, error_score='raise', return_estimator=True, scoring=SCORING)
# print('Dummy scores: {}'.format(dummy_scores['test_score']))
# print('Dummy mean score: {}'.format(np.mean(dummy_scores['test_score'])))

# Confusion matrix
results = cross_val_predict(grid_search, X, y, cv=5)
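# Follow-up: the out-of-fold predictions from cross_val_predict line up with
# y, so the confusion matrix is one call away. Minimal sketch:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, results))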
'feature_fraction': 0.02, 'learning_rate': 0.001, 'max_depth': 6, 'metric':'auc', 'min_data_in_leaf': 100, 'min_sum_hessian_in_leaf': 10.0, 'num_leaves': 13, 'n_jobs': 30, 'tree_learner': 'serial', 'objective': 'binary', 'verbosity': -1 } result=np.zeros(test.shape[0]) rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5,random_state=10) best_iteration , best_valid_auc = 0, 0 for counter,(train_index, valid_index) in enumerate(rskf.split(train, train.target),1): print ("Rep-Fold:",counter) sys.stdout.flush() #Train data t=train.iloc[train_index] trn_data = lgb.Dataset(t.drop("target",axis=1), label=t.target) #Validation data v=train.iloc[valid_index] val_data = lgb.Dataset(v.drop("target",axis=1), label=v.target) #Training model = lgb.train(param, trn_data, 1000000, feature_name=train.columns.tolist()[1:], valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 4000) result +=model.predict(test) ## feat imp gain = model.feature_importance('gain')
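# The loop above only accumulates prediction sums into `result`; assuming all
# n_splits * n_repeats = 25 fits complete, the averaged ensemble prediction
# would be obtained by dividing once at the end (hedged: the original snippet
# is truncated before this step):
final_prediction = result / 25.0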