def _single_k_fold(classifier, catalog, entity, k, dir_io, **kwargs):
    """Run a single stratified k-fold evaluation and compute performance
    over the union of all test folds."""
    predictions, test_set = None, []
    dataset, positive_samples_index = train.build_training_set(
        catalog, entity, dir_io)

    k_fold, binary_target_variables = utils.prepare_stratified_k_fold(
        k, dataset, positive_samples_index)

    for train_index, test_index in k_fold.split(dataset, binary_target_variables):
        training, test = dataset.iloc[train_index], dataset.iloc[test_index]
        test_set.append(test)

        model = utils.init_model(classifier, dataset.shape[1], **kwargs)
        model.fit(training, positive_samples_index & training.index)

        preds = model.predict(test)
        K.clear_session()  # Free memory

        # Accumulate predictions from every fold
        if predictions is None:
            predictions = preds
        else:
            predictions |= preds

    test_set = concat(test_set)

    return (
        predictions,
        _compute_performance(
            positive_samples_index & test_set.index, predictions, len(test_set)),
    )
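# ---------------------------------------------------------------------------
# `_compute_performance` is called in this excerpt but not defined here. The
# helper below is only a minimal, hypothetical sketch compatible with its call
# sites (true-match index, predicted-match index, test-set size); the actual
# implementation may differ, e.g. it may rely on `recordlinkage` evaluation
# utilities instead.
# ---------------------------------------------------------------------------
def _compute_performance_sketch(true_links, predicted_links, total):
    # Both link arguments are pandas (Multi)Index objects of record pairs
    true_positives = len(true_links.intersection(predicted_links))
    false_positives = len(predicted_links.difference(true_links))
    false_negatives = len(true_links.difference(predicted_links))
    true_negatives = total - true_positives - false_positives - false_negatives

    precision = (
        true_positives / len(predicted_links) if len(predicted_links) else 0.0
    )
    recall = true_positives / len(true_links) if len(true_links) else 0.0
    f_score = (
        2 * precision * recall / (precision + recall)
        if (precision + recall) else 0.0
    )
    confusion = (true_positives, false_positives, false_negatives, true_negatives)

    return precision, recall, f_score, confusion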
def _nested_k_fold_with_grid_search(
        classifier, param_grid, catalog, entity, k, scoring, dir_io, **kwargs):
    """Run nested stratified k-fold cross-validation with hyper-parameter
    grid search, dumping the best model of each outer fold to disk."""
    dataset, positive_samples_index = train.build_training_set(
        catalog, entity, dir_io)
    model = utils.init_model(classifier, dataset.shape[1], **kwargs).kernel

    inner_k_fold, target = utils.prepare_stratified_k_fold(
        k, dataset, positive_samples_index)
    outer_k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=1269)

    grid_search = GridSearchCV(
        model,
        param_grid,
        scoring=scoring,
        n_jobs=-1,
        cv=inner_k_fold,
        verbose=1,
    )

    result = []
    dataset = dataset.to_numpy()

    for fold, (train_index, test_index) in enumerate(
            outer_k_fold.split(dataset, target), 1):
        # Run grid search on the outer training split
        grid_search.fit(dataset[train_index], target[train_index])

        # Let grid search compute the test score on the outer test split
        test_score = grid_search.score(dataset[test_index], target[test_index])

        # No reason to keep trained models in memory. We will instead just dump
        # them to a file and keep the path
        best_model = grid_search.best_estimator_
        model_path = os.path.join(
            dir_io,
            constants.LINKER_NESTED_CV_BEST_MODEL.format(
                catalog, entity, classifier, fold),
        )
        joblib.dump(best_model, model_path)

        LOGGER.info("Best model for fold %d dumped to '%s'", fold, model_path)

        # Grid search best score is the train score
        result.append({
            f'train_{scoring}': grid_search.best_score_,
            f'test_{scoring}': test_score,
            'best_model': model_path,
            'params': grid_search.best_params_,
        })

    return result
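# ---------------------------------------------------------------------------
# Illustrative, self-contained sketch (not part of the module): the nested
# cross-validation pattern used in `_nested_k_fold_with_grid_search` above,
# shown on synthetic data with a plain scikit-learn estimator. All names below
# are hypothetical; hyper-parameters are tuned on each outer training split
# only, and the held-out outer test split yields an unbiased score.
# ---------------------------------------------------------------------------
def _nested_cv_example():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV, StratifiedKFold

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    outer = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

    search = GridSearchCV(
        LogisticRegression(max_iter=1000),
        {'C': [0.1, 1.0, 10.0]},
        scoring='f1',
        cv=inner,
    )

    scores = []
    for train_idx, test_idx in outer.split(X, y):
        # Inner loop: grid search over the outer training split
        search.fit(X[train_idx], y[train_idx])
        # Outer loop: score the best estimator on the outer test split
        scores.append(search.score(X[test_idx], y[test_idx]))

    return scores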
def _average_k_fold(classifier, catalog, entity, k, dir_io, **kwargs):
    """Run a stratified k-fold evaluation and average precision, recall,
    and F-score across folds."""
    predictions, precisions, recalls, f_scores = None, [], [], []
    dataset, positive_samples_index = train.build_training_set(
        catalog, entity, dir_io)

    k_fold, binary_target_variables = utils.prepare_stratified_k_fold(
        k, dataset, positive_samples_index)

    for train_index, test_index in k_fold.split(dataset, binary_target_variables):
        training, test = dataset.iloc[train_index], dataset.iloc[test_index]

        model = utils.init_model(classifier, dataset.shape[1], **kwargs)
        model.fit(training, positive_samples_index & training.index)

        preds = model.predict(test)
        K.clear_session()  # Free memory

        p, r, f, _ = _compute_performance(
            positive_samples_index & test.index, preds, len(test))

        # Accumulate predictions and per-fold scores
        if predictions is None:
            predictions = preds
        else:
            predictions |= preds

        precisions.append(p)
        recalls.append(r)
        f_scores.append(f)

    return (
        predictions,
        mean(precisions),
        std(precisions),
        mean(recalls),
        std(recalls),
        mean(f_scores),
        std(f_scores),
    )
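# ---------------------------------------------------------------------------
# Illustrative note (not part of the module): the `predictions |= preds` idiom
# above combines per-fold predictions, which appear to be pandas MultiIndex
# objects of record pairs, via a set union (`|` acted as `union()` on the
# pandas versions this code targets). A minimal, self-contained equivalent:
# ---------------------------------------------------------------------------
def _combine_fold_predictions_example():
    from pandas import MultiIndex

    fold_1 = MultiIndex.from_tuples([('Q1', 'T1'), ('Q2', 'T2')])
    fold_2 = MultiIndex.from_tuples([('Q2', 'T2'), ('Q3', 'T3')])

    # Pairs predicted as matches in any fold
    return fold_1.union(fold_2)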