def verbose_wait(amr, clientview, return_train_scores):
    """Wait for all tasks in ``amr`` to finish, printing progress and a summary.

    Parameters
    ----------
    amr : object
        Handle on the submitted tasks. This function reads ``len(amr)``,
        ``amr.msg_ids``, ``amr.elapsed`` and ``amr._metadata``.
        NOTE(review): presumably an IPython.parallel AsyncMapResult -- confirm.
    clientview : object
        Object used to poll the tasks; it must provide ``wait``,
        ``outstanding`` and ``get_result``.
    return_train_scores : bool
        If true, the test score is read from position 1 of each result,
        otherwise from position 0.

    Raises
    ------
    RemoteError
        Re-raised, after its traceback has been printed, when it occurs while
        reading or reporting the results of a finished task.
    """
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source; confirm that ``else`` pairs with
    # ``if len(finished) > 0`` and that the flush/sleep belong to that branch.
    N = len(amr)
    pending = set(amr.msg_ids)
    while pending:
        try:
            # Short poll; a timeout only means nothing new has finished yet.
            clientview.wait(pending, 1e-3)
        except parallel.TimeoutError:
            pass
        n_completed = N - len(clientview.outstanding)
        # Tasks we were waiting on that are no longer outstanding.
        finished = pending.difference(clientview.outstanding)
        pending = pending.difference(finished)
        if len(finished) > 0:
            # Leave the '\r' progress line before printing per-task results.
            print()
            for msg_id in finished:
                ar = clientview.get_result(msg_id)
                try:
                    for result in ar.result:
                        # The last two entries of a result are the elapsed
                        # time and the parameter dict.
                        elapsed, params = result[-2], result[-1]
                        test_score = result[1] if return_train_scores else result[0]
                        left = '[CV engine={}] {} '.format(
                            ar.engine_id,
                            ', '.join('{}={}'.format(k, v) for k, v in params.items()))
                        right = ' score = {:5f} {}'.format(
                            test_score, short_format_time(elapsed))
                        # Pad with dashes so each report line is 70 columns.
                        print(left + right.rjust(70-len(left), '-'))
                except RemoteError as e:
                    e.print_traceback()
                    raise
        else:
            # Nothing finished: redraw the single-line progress indicator.
            left = '\r[Parallel] {0:d}/{1:d} tasks finished'.format(n_completed, N)
            right = 'elapsed {0} '.format(short_format_time(amr.elapsed))
            print(left + right.rjust(71-len(left)), end='')
            sys.stdout.flush()
            # Sleep for one second adjusted by the rounding error of the
            # elapsed time, so redraws drift towards whole seconds.
            time.sleep(1 + round(amr.elapsed) - amr.elapsed)
    n_engines = len(set(e['engine_id'] for e in amr._metadata))
    # Sum of per-task (completed - submitted) durations, in seconds.
    engine_time = sum((e.completed - e.submitted for e in amr._metadata),
                      datetime.timedelta()).total_seconds()
    m1 = 'Elapsed walltime: {}'.format(short_format_time(amr.elapsed))
    m2 = 'Elapsed engine time: {}'.format(short_format_time(engine_time))
    # The values of the last two rows are right-aligned to the width of m2.
    m3a = 'Parallel speedup:'
    m3b = '{:.3f}'.format(engine_time/ amr.elapsed).rjust(len(m2)-len(m3a))
    m4a = 'Number of engines:'
    m4b = '{}'.format(n_engines).rjust(len(m2)-len(m4a))
    print('\n\nTasks completed')
    print('-'*len(m2))
    print(m1)
    print(m2)
    print(m3a + m3b)
    print(m4a + m4b)
    print('-'*len(m2))
def fit_grid_point(base_estimator, parameters, X, y, sample_weight,
                   train, test, verbose, **fit_params):
    """Fit a clone of ``base_estimator`` on one parameter setting and split.

    Parameters
    ----------
    base_estimator : estimator
        Template estimator; it is cloned, never fitted directly.
    parameters : dict
        Passed to ``set_params`` on the clone and returned unchanged.
    X, y, sample_weight :
        Full data, forwarded to ``_safe_split`` together with the indices.
    train, test :
        Split indices; returned unchanged so the caller can score later.
    verbose : int
        Values above 1 print a start and an end line with the fit duration.
    **fit_params :
        Extra keyword arguments for ``estimator.fit``. When ``sample_weight``
        is given, the training weights are stored under ``'sample_weight'``.

    Returns
    -------
    tuple
        ``(estimator, parameters, train, test)`` with the fitted clone first.
    """
    if verbose > 1:
        start_time = time.time()
        # ``items()`` and the parenthesised single-argument ``print`` behave
        # the same on Python 2 and Python 3.
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    estimator = clone(base_estimator)
    estimator.set_params(**parameters)

    X_train, y_train, sample_weight_train = _safe_split(
        estimator, X, y, sample_weight, train)
    # NOTE(review): the test split is computed but never used in this
    # function; it could probably be dropped if ``_safe_split`` has no side
    # effects -- confirm before removing.
    _safe_split(estimator, X, y, sample_weight, test, train)

    if sample_weight is not None:
        fit_params = fit_params.copy()
        fit_params['sample_weight'] = sample_weight_train

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))
    return estimator, parameters, train, test
def fit_grid_point(base_estimator, parameters, X, y, sample_weight,
                   train, test, verbose, **fit_params):
    """Fit a clone of ``base_estimator`` on one parameter setting and split.

    Parameters
    ----------
    base_estimator : estimator
        Template estimator; it is cloned, never fitted directly.
    parameters : dict
        Passed to ``set_params`` on the clone and returned unchanged.
    X, y, sample_weight :
        Full data, forwarded to ``_safe_split`` together with the indices.
    train, test :
        Split indices; returned unchanged so the caller can score later.
    verbose : int
        Values above 1 print a start and an end line with the fit duration.
    **fit_params :
        Extra keyword arguments for ``estimator.fit``. When ``sample_weight``
        is given, the training weights are stored under ``'sample_weight'``.

    Returns
    -------
    tuple
        ``(estimator, parameters, train, test)`` with the fitted clone first.
    """
    if verbose > 1:
        start_time = time.time()
        # ``items()`` and the parenthesised single-argument ``print`` behave
        # the same on Python 2 and Python 3.
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    estimator = clone(base_estimator)
    estimator.set_params(**parameters)

    X_train, y_train, sample_weight_train = _safe_split(
        estimator, X, y, sample_weight, train)
    # NOTE(review): the test split is computed but never used in this
    # function; it could probably be dropped if ``_safe_split`` has no side
    # effects -- confirm before removing.
    _safe_split(estimator, X, y, sample_weight, test, train)

    if sample_weight is not None:
        fit_params = fit_params.copy()
        fit_params['sample_weight'] = sample_weight_train

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))
    return estimator, parameters, train, test
def _fit_and_score(estimator, depthmaps, offset_points_projected,
                   direction_vectors, true_joints, scorer, train, test,
                   verbose, parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """Fit ``estimator`` on the train split and score it on the test split.

    The estimator is fitted on the depth maps, projected offset points and
    direction vectors of the train split, and scored against the true joints.

    Returns
    -------
    list
        ``[train_score]`` (only if ``return_train_score``), then
        ``test_score``, the number of test samples, the elapsed time in
        seconds and, if ``return_parameters``, ``parameters``.

    Raises
    ------
    ValueError
        If fitting fails and ``error_score`` is neither ``'raise'`` nor a
        number. With ``'raise'`` the fit error itself propagates.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (name, value)
                            for name, value in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Restrict per-sample fit parameters (e.g. sample weights) to the
    # train split.
    if fit_params is None:
        fit_params = {}
    fit_params = {name: _index_param_value(depthmaps, value, train)
                  for name, value in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    train_split = _safe_split(depthmaps, offset_points_projected,
                              direction_vectors, true_joints, train)
    depth_train, offsets_train, directions_train, truths_train = train_split
    test_split = _safe_split(depthmaps, offset_points_projected,
                             direction_vectors, true_joints, test)
    depth_test, _offsets_test, _directions_test, truths_test = test_split

    try:
        estimator.fit(depth_train, offsets_train, directions_train,
                      **fit_params)
    except Exception as exc:
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )
        # A numeric error_score stands in for every requested score.
        test_score = error_score
        if return_train_score:
            train_score = error_score
        warnings.warn("Classifier fit failed. The score on this train-test"
                      " partition for these parameters will be set to %f. "
                      "Details: \n%r" % (error_score, exc), FitFailedWarning)
    else:
        test_score = _score(estimator, depth_test, truths_test, scorer)
        if return_train_score:
            train_score = _score(estimator, depth_train, truths_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = []
    if return_train_score:
        ret.append(train_score)
    ret += [test_score, _num_samples(depth_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
def _fit_and_score(estimator, Z, scorer, train, test, verbose, parameters,
                   fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """Fit ``estimator`` on ``Z[train]`` and score it on ``Z[test]``.

    Returns
    -------
    list
        ``[train_score]`` (only if ``return_train_score``), then
        ``test_score``, the number of test samples, the elapsed time in
        seconds and, if ``return_parameters``, ``parameters``.

    Raises
    ------
    ValueError
        If fitting fails and ``error_score`` is neither ``'raise'`` nor a
        number. With ``'raise'`` the fit error itself propagates.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (name, value)
                            for name, value in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    if fit_params is None:
        fit_params = {}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    Z_train, Z_test = Z[train], Z[test]

    try:
        estimator.fit(Z_train, **fit_params)
    except Exception as exc:
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )
        # A numeric error_score stands in for every requested score.
        test_score = error_score
        if return_train_score:
            train_score = error_score
        warnings.warn("Classifier fit failed. The score on this train-test"
                      " partition for these parameters will be set to %f. "
                      "Details: \n%r" % (error_score, exc), FitFailedWarning)
    else:
        test_score = _score(estimator, Z_test, scorer)
        if return_train_score:
            train_score = _score(estimator, Z_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = []
    if return_train_score:
        ret.append(train_score)
    ret += [test_score, _num_samples(Z_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
def score_each_boost(estimator, parameters, min_n_estimators, X, y,
                     sample_weight, score_func, train, test, verbose):
    """Score every boosting stage of a fitted estimator on one test split.

    Parameters
    ----------
    estimator : estimator
        Fitted estimator providing ``staged_predict`` and ``n_estimators``.
    parameters : dict
        Parameter setting; copied for each stage with ``'n_estimators'`` set
        to the number of stages used.
    min_n_estimators : int
        Stages with fewer estimators than this are skipped.
    X, y, sample_weight :
        Full data, forwarded to ``_safe_split`` together with the indices.
    score_func : callable
        Called as ``score_func(y_test, y_pred)``, plus
        ``sample_weight=...`` when ``sample_weight`` is given.
    train, test :
        Split indices forwarded to ``_safe_split``.
    verbose : int
        Values above 1 print a start and an end line with the duration.

    Returns
    -------
    all_scores : list
        One score per stage from ``min_n_estimators`` to ``n_estimators``.
    all_clf_params : list of dict
        Parameters matching each score.
    n_test_samples : list of int
        Number of test samples, repeated once per score.

    Raises
    ------
    ValueError
        If padding is needed but no stage was scored.
    """
    if verbose > 1:
        start_time = time.time()
        # ``items()`` and the parenthesised single-argument ``print`` behave
        # the same on Python 2 and Python 3.
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X_test, y_test, sample_weight_test = _safe_split(
        estimator, X, y, sample_weight, test, train)

    test_score_params = {}
    if sample_weight is not None:
        test_score_params['sample_weight'] = sample_weight_test

    this_n_test_samples = _num_samples(X_test)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    for i, y_pred in enumerate(estimator.staged_predict(X_test)):
        n_stages = i + 1
        if n_stages < min_n_estimators:
            continue
        all_scores.append(score_func(y_test, y_pred, **test_score_params))
        clf_para = copy(parameters)
        clf_para['n_estimators'] = n_stages
        all_clf_params.append(clf_para)
        n_test_samples.append(this_n_test_samples)

    # boosting may have stopped early: repeat the last score for the stages
    # that were never built
    n_missing = (estimator.n_estimators - min_n_estimators + 1
                 - len(all_scores))
    if n_missing > 0:
        if not all_scores:
            raise ValueError(
                "boosting stopped before min_n_estimators=%s stages were "
                "built; there is no score to carry forward"
                % (min_n_estimators,))
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        # Continue numbering from the last stage actually scored. Using the
        # list position instead is only correct when min_n_estimators == 1.
        last_n_stages = last_clf_params['n_estimators']
        for extra in range(1, n_missing + 1):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = last_n_stages + extra
            all_clf_params.append(clf_para)
            n_test_samples.append(this_n_test_samples)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))
    return all_scores, all_clf_params, n_test_samples
def score_each_boost(estimator, parameters, min_n_estimators, X, y,
                     sample_weight, score_func, train, test, verbose):
    """Score every boosting stage of a fitted estimator on one test split.

    Parameters
    ----------
    estimator : estimator
        Fitted estimator providing ``staged_predict`` and ``n_estimators``.
    parameters : dict
        Parameter setting; copied for each stage with ``'n_estimators'`` set
        to the number of stages used.
    min_n_estimators : int
        Stages with fewer estimators than this are skipped.
    X, y, sample_weight :
        Full data, forwarded to ``_safe_split`` together with the indices.
    score_func : callable
        Called as ``score_func(y_test, y_pred)``, plus
        ``sample_weight=...`` when ``sample_weight`` is given.
    train, test :
        Split indices forwarded to ``_safe_split``.
    verbose : int
        Values above 1 print a start and an end line with the duration.

    Returns
    -------
    all_scores : list
        One score per stage from ``min_n_estimators`` to ``n_estimators``.
    all_clf_params : list of dict
        Parameters matching each score.
    n_test_samples : list of int
        Number of test samples, repeated once per score.

    Raises
    ------
    ValueError
        If padding is needed but no stage was scored.
    """
    if verbose > 1:
        start_time = time.time()
        # ``items()`` and the parenthesised single-argument ``print`` behave
        # the same on Python 2 and Python 3.
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X_test, y_test, sample_weight_test = _safe_split(
        estimator, X, y, sample_weight, test, train)

    test_score_params = {}
    if sample_weight is not None:
        test_score_params['sample_weight'] = sample_weight_test

    this_n_test_samples = _num_samples(X_test)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    for i, y_pred in enumerate(estimator.staged_predict(X_test)):
        n_stages = i + 1
        if n_stages < min_n_estimators:
            continue
        all_scores.append(score_func(y_test, y_pred, **test_score_params))
        clf_para = copy(parameters)
        clf_para['n_estimators'] = n_stages
        all_clf_params.append(clf_para)
        n_test_samples.append(this_n_test_samples)

    # boosting may have stopped early: repeat the last score for the stages
    # that were never built
    n_missing = (estimator.n_estimators - min_n_estimators + 1
                 - len(all_scores))
    if n_missing > 0:
        if not all_scores:
            raise ValueError(
                "boosting stopped before min_n_estimators=%s stages were "
                "built; there is no score to carry forward"
                % (min_n_estimators,))
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        # Continue numbering from the last stage actually scored. Using the
        # list position instead is only correct when min_n_estimators == 1.
        last_n_stages = last_clf_params['n_estimators']
        for extra in range(1, n_missing + 1):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = last_n_stages + extra
            all_clf_params.append(clf_para)
            n_test_samples.append(this_n_test_samples)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))
    return all_scores, all_clf_params, n_test_samples
def fit_and_score_n_support(estimator, X, y, scorer, train, test, verbose,
                            parameters, fit_params, return_train_score=False,
                            return_parameters=False, return_n_support=True,
                            error_score='raise'):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.

    return_n_support : boolean, optional, default: True
        Append ``estimator.n_support_`` to the result.

    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is
        `True`.

    test_score : float
        Score on test set.

    n_test_samples : int
        Number of test samples.

    scoring_time : float
        Time spent for fitting and scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.

    n_support : object or None, optional
        ``estimator.n_support_``, returned only if `return_n_support` is
        `True`. ``None`` when fitting failed and a numeric `error_score`
        was used, because the attribute cannot be relied on in that case.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    fit_failed = False
    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            fit_failed = True
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )
    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    if return_n_support:
        # Reading ``n_support_`` after a failed fit would raise and defeat a
        # numeric ``error_score``; report None for that case instead.
        ret.append(None if fit_failed else estimator.n_support_)
    return ret
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise',
                   return_estimator=False, return_idx=False):
    """Fit ``estimator`` on one train/test split and score it.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        Object fitted on the train part of the split.
    X : array-like of shape at least 2D
        Data to fit.
    y : array-like, optional
        Target for supervised learning; when the train target is ``None``
        the estimator is fitted on ``X_train`` alone.
    scorer : callable or dict of name -> callable
        A single scorer yields plain scores; anything that is not callable
        is treated as a mapping and yields one score per name.
    train, test : array-like
        Indices of the train and test samples.
    verbose : integer
        Above 1 prints a start and end line; above 2 also prints the scores.
    parameters : dict or None
        Parameters set on the estimator before fitting.
    fit_params : dict or None
        Parameters passed to ``estimator.fit``, indexed by ``train`` first.
    return_train_score : boolean, default False
        Also compute and return the score on the train split.
    return_parameters : boolean, default False
        Append ``parameters`` to the result.
    return_n_test_samples : boolean, default False
        Append the number of test samples to the result.
    return_times : boolean, default False
        Append the fit time and the score time to the result.
    error_score : 'raise' (default) or numeric
        With 'raise' a fit error propagates. With a number, that number
        replaces every score and a FitFailedWarning is issued.
    return_estimator : boolean, default False
        Append the estimator to the result.
    return_idx : boolean, default False
        Append ``train`` and ``test`` to the result.

    Returns
    -------
    list
        In order, each part only when requested: train scores, test scores
        (always), number of test samples, fit time, score time, parameters,
        estimator, train indices, test indices.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = ', '.join('%s=%s' % (name, value)
                            for name, value in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Restrict per-sample fit parameters (e.g. sample weights) to the
    # train split.
    if fit_params is None:
        fit_params = {}
    fit_params = {name: _index_param_value(X, value, train)
                  for name, value in fit_params.items()}

    test_scores = {}
    train_scores = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    # do it for each patient
    X_train, y_train, X_test, y_test = _safe_split_multi(
        estimator, X, y, train, test)

    is_multimetric = not callable(scorer)
    n_scorers = len(scorer.keys()) if is_multimetric else 1

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as exc:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
        if is_multimetric:
            # One copy of the placeholder score per scorer name.
            test_scores = dict(zip(scorer.keys(),
                                   [error_score] * n_scorers))
            if return_train_score:
                train_scores = dict(zip(scorer.keys(),
                                        [error_score] * n_scorers))
        else:
            test_scores = error_score
            if return_train_score:
                train_scores = error_score
        warnings.warn("Classifier fit failed. The score on this train-test"
                      " partition for these parameters will be set to %f. "
                      "Details: \n%r" % (error_score, exc), FitFailedWarning)
    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer,
                             is_multimetric)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

    if verbose > 2:
        if is_multimetric:
            for scorer_name, score in test_scores.items():
                msg += ", %s=%s" % (scorer_name, score)
        else:
            msg += ", score=%s" % test_scores
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg,
                                    logger.short_format_time(total_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [test_scores]
    if return_train_score:
        ret.insert(0, train_scores)
    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret += [fit_time, score_time]
    if return_parameters:
        ret.append(parameters)
    if return_estimator:
        ret.append(estimator)
    if return_idx:
        ret += [train, test]
    return ret
def _fit_and_score(estimator, frame, feature_names, target_feature, scorer,
                   parameters, verbose, scoring_params, train, test,
                   is_regression, act_args, cv_fold, iteration):
    """Fit one fold with one parameter setting and score it on the test fold.

    Parameters
    ----------
    estimator : H2OPipeline or H2OEstimator
        The estimator to fit.
    frame : H2OFrame, shape=(n_samples, n_features)
        The training frame; it is sliced by the sorted fold indices.
    feature_names : iterable (str)
        Feature names passed as ``x`` when a bare H2OEstimator is trained.
    target_feature : str
        Name of the target feature.
    scorer : H2OScorer
        The scoring function handed to ``_score``.
    parameters : dict or None
        Parameters to apply to the estimator; ``None`` is treated as ``{}``.
    verbose : int
        Above 1 prints progress lines; above 2 also prints the score.
    scoring_params : dict
        Keyword arguments for scoring, used when ``act_args`` is ``None``.
    train, test : iterable
        Fold indices; both are sorted before slicing.
    is_regression : bool
        Whether the target is continuous.
    act_args : dict or None
        When given, its ``'expo'``, ``'loss'`` and ``'prem'`` entries are
        sliced by the test indices and used as the scoring keyword arguments.
    cv_fold : int
        Fold number, for reporting only.
    iteration : int
        Iteration number, for reporting only.

    Returns
    -------
    out : list, shape=(4,)
        ``[test_score, len(test), estimator, parameters]``.

    Raises
    ------
    ValueError
        If a bare H2OEstimator is given a parameter name containing ``'__'``.
    """
    if parameters is None:
        parameters = {}

    if verbose > 1:
        msg = ''
        if parameters:
            settings = ', '.join('%s=%s' % (name, value)
                                 for name, value in parameters.items())
            msg = 'Target: %s; %s' % (target_feature, settings)
        print("[CV (iter %i, fold %i)] %s %s"
              % (iteration, cv_fold, msg, (64 - len(msg)) * '.'))

    # h2o doesn't currently re-order rows... and sometimes will
    # complain for some reason. We need to sort our train/test idcs
    train = sorted(train)
    test = sorted(test)

    # A gains search supplies act_args; only its numpy arrays need slicing.
    if act_args is None:
        kwargs = scoring_params
    else:
        kwargs = {}
        kwargs['expo'] = act_args['expo'][test]
        kwargs['loss'] = act_args['loss'][test]
        if act_args['prem'] is not None:
            kwargs['prem'] = act_args['prem'][test]
        else:
            kwargs['prem'] = None

    # generate split
    train_frame = frame[train, :]
    test_frame = frame[test, :]

    start_time = time.time()

    if isinstance(estimator, H2OEstimator):
        # A bare estimator has no sub-steps, so nested names are an error.
        for name, value in six.iteritems(parameters):
            if '__' in name:
                raise ValueError('only one estimator passed to grid search, '
                                 'but multiple named parameters passed: %s'
                                 % name)
            estimator._parms[name] = value
        estimator.train(training_frame=train_frame, x=feature_names,
                        y=target_feature)
    else:
        # Anything else is treated as a pipeline; the feature and target
        # names are expected to have been set before cloning.
        estimator.set_params(**parameters)
        estimator.fit(train_frame)

    test_score = _score(estimator, test_frame, target_feature, scorer,
                        is_regression, **kwargs)

    # h2o is verbose.. if we are too, print a new line:
    if verbose > 1:
        print()

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ', score=%f' % test_score
    if verbose > 1:
        end_msg = '%s -%s' % (msg, logger.short_format_time(scoring_time))
        print('[CV (iter %i, fold %i)] %s %s'
              % (iteration, cv_fold, (64 - len(end_msg)) * '.', end_msg))
        print()
        print()

    return [test_score, len(test), estimator, parameters]
def fit_grid_point(X, y, sample_weight, base_clf, clf_params, train, test,
                   verbose, **fit_params):
    """Fit a clone of ``base_clf`` on one parameter setting and train split.

    Parameters
    ----------
    X, y :
        Full data; passed through ``check_arrays`` before slicing.
    sample_weight : array-like or None
        When given, its train part is passed to ``fit`` as ``sample_weight``.
    base_clf : estimator
        Template estimator; it is cloned, never fitted directly.
    clf_params : dict
        Passed to ``set_params`` on the clone and returned unchanged.
    train, test :
        Split indices or masks. Only ``train`` is used for fitting; ``test``
        is returned unchanged so the caller can score later.
    verbose : int
        Values above 1 print a start and an end line with the fit duration.
    **fit_params :
        Extra keyword arguments for ``clf.fit``.

    Returns
    -------
    tuple
        ``(clf, clf_params, train, test)`` with the fitted clone first.

    Raises
    ------
    ValueError
        If ``base_clf.kernel`` is callable, or if ``base_clf._pairwise`` is
        set and ``X`` is not square.
    """
    if verbose > 1:
        start_time = time.time()
        # ``items()`` and the parenthesised single-argument ``print`` behave
        # the same on Python 2 and Python 3.
        msg = ', '.join('%s=%s' % (k, v) for k, v in clf_params.items())
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X, y = check_arrays(X, y)

    # update parameters of the classifier after a copy of its base structure
    clf = clone(base_clf)
    clf.set_params(**clf_params)

    if hasattr(base_clf, 'kernel') and hasattr(base_clf.kernel, '__call__'):
        # cannot compute the kernel values with custom function
        raise ValueError(
            "Cannot use a custom kernel function. "
            "Precompute the kernel matrix instead.")

    # Only the train side is sliced; nothing in this function reads the
    # test-side data.
    if getattr(base_clf, "_pairwise", False):
        # X is a precomputed square kernel matrix
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square kernel matrix")
        X_train = X[np.ix_(train, train)]
    else:
        X_train = X[safe_mask(X, train)]

    y_train = y[safe_mask(y, train)] if y is not None else None

    if sample_weight is not None:
        sample_weight_train = sample_weight[safe_mask(sample_weight, train)]
        clf.fit(X_train, y_train, sample_weight=sample_weight_train,
                **fit_params)
    else:
        clf.fit(X_train, y_train, **fit_params)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))
    return clf, clf_params, train, test
def score_each_boost(X, y, sample_weight, clf, clf_params, min_n_estimators,
                     train, test, loss_func, score_func, verbose):
    """Score every boosting stage of a fitted classifier on one test split.

    Parameters
    ----------
    X, y :
        Full data; passed through ``check_arrays`` before slicing.
    sample_weight : array-like or None
        When given, and neither ``loss_func`` nor ``score_func`` is set, its
        test part is passed to ``clf.staged_score``.
    clf : estimator
        Fitted estimator providing ``staged_predict`` / ``staged_score`` and
        ``n_estimators``.
    clf_params : dict
        Parameter setting; copied for each stage with ``'n_estimators'`` set
        to the number of stages used.
    min_n_estimators : int
        Stages with fewer estimators than this are skipped.
    train, test :
        Split indices or masks. ``train`` is only needed to select the
        columns of a precomputed kernel matrix.
    loss_func : callable or None
        If given, the score is ``-loss_func(y_test, y_pred)``.
    score_func : callable or None
        Used when ``loss_func`` is None: ``score_func(y_test, y_pred)``.
        If both are None, ``clf.staged_score`` provides the scores.
    verbose : int
        Values above 1 print a start and an end line with the duration.

    Returns
    -------
    all_scores : list
        One score per stage from ``min_n_estimators`` to ``n_estimators``.
    all_clf_params : list of dict
        Parameters matching each score.
    n_test_samples : list of int
        Size of the test split, repeated once per score.

    Raises
    ------
    ValueError
        If ``clf.kernel`` is callable, if ``clf._pairwise`` is set and ``X``
        is not square, or if padding is needed but no stage was scored.
    """
    if hasattr(clf, 'kernel') and hasattr(clf.kernel, '__call__'):
        # cannot compute the kernel values with custom function
        raise ValueError(
            "Cannot use a custom kernel function. "
            "Precompute the kernel matrix instead.")

    X, y = check_arrays(X, y)

    # Only the test side is sliced; nothing in this function reads the
    # train-side data.
    if getattr(clf, "_pairwise", False):
        # X is a precomputed square kernel matrix
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square kernel matrix")
        X_test = X[np.ix_(test, train)]
    else:
        X_test = X[safe_mask(X, test)]

    y_test = y[safe_mask(y, test)] if y is not None else None

    if sample_weight is not None:
        sample_weight_test = sample_weight[safe_mask(sample_weight, test)]
    else:
        sample_weight_test = None

    if verbose > 1:
        start_time = time.time()
        # ``items()`` and the parenthesised single-argument ``print`` behave
        # the same on Python 2 and Python 3.
        msg = ', '.join('%s=%s' % (k, v) for k, v in clf_params.items())
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Count the samples of the test split, not of the full data set.
    counted = y_test if y is not None else X_test
    if hasattr(counted, 'shape'):
        this_n_test_samples = counted.shape[0]
    else:
        this_n_test_samples = len(counted)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    # TODO: include support for sample_weight in score functions
    if loss_func is not None or score_func is not None:
        stages = clf.staged_predict(X_test)
    else:
        score_kwargs = {}
        if sample_weight_test is not None:
            score_kwargs['sample_weight'] = sample_weight_test
        stages = clf.staged_score(X_test, y_test, **score_kwargs)

    for i, stage in enumerate(stages):
        n_stages = i + 1
        if n_stages < min_n_estimators:
            continue
        if loss_func is not None:
            score = -loss_func(y_test, stage)
        elif score_func is not None:
            score = score_func(y_test, stage)
        else:
            # staged_score already yields scores
            score = stage
        all_scores.append(score)
        clf_para = copy(clf_params)
        clf_para['n_estimators'] = n_stages
        all_clf_params.append(clf_para)
        n_test_samples.append(this_n_test_samples)

    # boosting may have stopped early: repeat the last score for the stages
    # that were never built
    n_missing = clf.n_estimators - min_n_estimators + 1 - len(all_scores)
    if n_missing > 0:
        if not all_scores:
            raise ValueError(
                "boosting stopped before min_n_estimators=%s stages were "
                "built; there is no score to carry forward"
                % (min_n_estimators,))
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        # Continue numbering from the last stage actually scored. Using the
        # list position instead is only correct when min_n_estimators == 1.
        last_n_stages = last_clf_params['n_estimators']
        for extra in range(1, n_missing + 1):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = last_n_stages + extra
            all_clf_params.append(clf_para)
            n_test_samples.append(this_n_test_samples)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))
    return all_scores, all_clf_params, n_test_samples
def fit_grid_point(X, y, base_clf, clf_params, train, test, loss_func,
                   score_func, verbose, param_id=None, **fit_params):
    """Fit a copy of ``base_clf`` on one parameter setting and score it.

    Parameters
    ----------
    X : list, tuple, array or sparse matrix
        Data. Lists and tuples are filtered element-wise by the truthiness
        of ``train`` / ``test``; sparse input is converted to CSR and
        indexed by position.
    y : indexable or None
        Targets, indexed with ``train`` / ``test`` when not None.
    base_clf : estimator
        Template estimator; a deep copy of it is fitted.
    clf_params : dict
        Passed to ``set_params`` on the copy and returned unchanged.
    train, test :
        Masks or indices selecting the train and test samples.
    loss_func : callable or None
        If given, the score is ``-loss_func(y_test, y_pred)``.
    score_func : callable or None
        Used when ``loss_func`` is None: ``score_func(y_test, y_pred)``.
        If both are None, ``clf.score(X_test, y_test)`` is used.
    verbose : int
        Above 1 prints a start and end line; above 2 also prints the score.
    param_id : object, optional
        Returned unchanged as the first element of the result.
    **fit_params :
        Extra keyword arguments for ``clf.fit``.

    Returns
    -------
    tuple
        ``(param_id, clf_params, this_score)``.

    Raises
    ------
    ValueError
        For non-list input, if ``base_clf`` has a ``kernel_function``
        attribute, or if its kernel is ``'precomputed'`` and ``X`` is not
        square.
    """
    if verbose > 1:
        start_time = time.time()
        # ``items()`` and the parenthesised single-argument ``print`` behave
        # the same on Python 2 and Python 3.
        msg = ', '.join('%s=%s' % (k, v) for k, v in clf_params.items())
        print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    # FIXME we should be doing a clone here
    clf = copy.deepcopy(base_clf)
    clf.set_params(**clf_params)

    if isinstance(X, (list, tuple)):
        X_train = [X[i] for i, cond in enumerate(train) if cond]
        X_test = [X[i] for i, cond in enumerate(test) if cond]
    else:
        if sp.issparse(X):
            # For sparse matrices, slicing only works with indices
            # (no masked array). Convert to CSR format for efficiency and
            # because some sparse formats don't support row slicing.
            X = sp.csr_matrix(X)
            ind = np.arange(X.shape[0])
            train = ind[train]
            test = ind[test]
        if hasattr(base_clf, 'kernel_function'):
            # cannot compute the kernel values with custom function
            raise ValueError(
                "Cannot use a custom kernel function. "
                "Precompute the kernel matrix instead.")
        if getattr(base_clf, 'kernel', '') == 'precomputed':
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            X_train = X[np.ix_(train, train)]
            X_test = X[np.ix_(test, train)]
        else:
            X_train = X[train]
            X_test = X[test]

    if y is not None:
        y_test = y[test]
        y_train = y[train]
    else:
        y_test = None
        y_train = None

    clf.fit(X_train, y_train, **fit_params)

    if loss_func is not None:
        y_pred = clf.predict(X_test)
        this_score = -loss_func(y_test, y_pred)
    elif score_func is not None:
        y_pred = clf.predict(X_test)
        this_score = score_func(y_test, y_pred)
    else:
        this_score = clf.score(X_test, y_test)

    if verbose > 2:
        msg += ", score=%f" % this_score
    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
    return param_id, clf_params, this_score
def _extended_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                            parameters, fit_params, return_train_score=False,
                            return_parameters=False, error_score='raise',
                            extraOut="auto"):
    """Fit and score one split, appending a dict of extra outputs.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        Object fitted on the train split.
    X, y :
        Data and optional target, split with ``_safe_split``.
    scorer : callable
        Scorer handed to ``_score``.
    train, test : array-like
        Sample indices; ``.shape[0]`` of both is read when ``extraOut`` is
        not None.
    verbose : integer
        Above 1 prints a start and end line; above 2 also prints the test
        size and score.
    parameters : dict or None
        Parameters set on the estimator before fitting.
    fit_params : dict or None
        Parameters passed to ``estimator.fit``, indexed by ``train`` first.
    return_train_score : boolean, default False
        Also compute and return the score on the train split.
    return_parameters : boolean, default False
        Append ``parameters`` to the result.
    error_score : 'raise' (default) or numeric
        With 'raise' a fit error propagates. With a number, that number
        replaces the scores and a FitFailedWarning is issued.
    extraOut : None, "auto" or container of str, default "auto"
        Selects the extra outputs. ``None`` yields an empty dict. Otherwise
        ``"counts"`` is always included; ``"estimator"`` is included if
        listed; ``"probabilities"`` if ``"auto"`` or ``"predictions"`` is
        listed; ``"importances"`` if ``"auto"`` or ``"importances"`` is
        listed and the estimator has ``feature_importances_``.

    Returns
    -------
    list
        ``[train_score]`` (optional), ``test_score``, number of test samples,
        elapsed time in seconds, ``parameters`` (optional), and finally the
        dict of extra outputs.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += " , n=" + str(X_test.shape[0]) + ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)

    # Add additional return values
    # NOTE(review): nesting reconstructed from a whitespace-collapsed source;
    # confirm that the dict is appended even when extraOut is None.
    extraRVs = {}
    if extraOut is not None:
        extraRVs["counts"] = {"train": train.shape[0], "test": test.shape[0]}
        if "estimator" in extraOut:
            extraRVs["estimator"] = estimator
        if extraOut == "auto" or "predictions" in extraOut:
            # Each test index must line up with exactly one prediction row.
            assert test.shape[0] == X_test.shape[0]
            probabilities = estimator.predict_proba(X_test)
            extraRVs["probabilities"] = dict(zip(test, probabilities))
        if (extraOut == "auto" or "importances" in extraOut) and hasattr(
                estimator, "feature_importances_"):
            extraRVs["importances"] = estimator.feature_importances_
    ret.append(extraRVs)
    return ret
def _fit_and_score(
    estimator, X, y, scorer, train, test, verbose, parameters, fit_params,
    return_train_score=False, return_parameters=False,
    return_n_test_samples=False, return_times=False, error_score="raise",
):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : object
        Estimator that is configured with ``parameters`` and fitted in place.
    X, y : data and targets
        Split through ``_safe_split`` using ``train`` and ``test``.
    scorer : iterable
        Iterated over; every element is passed to ``_score``.
    train, test : array-like
        Sample selections for the two folds.
    verbose : int
        ``> 1`` logs start/end lines, ``> 2`` adds the test scores.
    parameters : dict or None
        Estimator parameters to set before fitting.
    fit_params : dict or None
        Extra ``fit`` arguments, indexed to the training fold.
    return_train_score, return_n_test_samples, return_times,
    return_parameters : bool
        Select the optional entries of the result.
    error_score : 'raise' or number
        ``'raise'`` re-raises a fit error; a number is used in place of the
        scores and a ``FitFailedWarning`` is issued.

    Returns
    -------
    list
        ``[train_score]`` (optional), ``test_score``, number of test samples
        (optional), ``fit_time`` and ``score_time`` (optional), ``parameters``
        (optional). Scores are lists with one value per scorer, or the bare
        ``error_score`` when the fit failed.

    Raises
    ------
    ValueError
        If the fit fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    if verbose > 1:
        if parameters is None:
            msg = ""
        else:
            msg = "%s" % (", ".join("%s=%s" % (k, v)
                                    for k, v in parameters.items()))
        LOG.info("[CV] %s %s", msg, (64 - len(msg)) * ".")

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == "raise":
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e),
                FitFailedWarning,
            )
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        fit_time = time.time() - start_time
        test_score = [_score(estimator, X_test, y_test, s) for s in scorer]
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = [
                _score(estimator, X_train, y_train, s) for s in scorer
            ]

    if verbose > 2:
        # After a failed fit ``test_score`` is the bare ``error_score``;
        # wrap it so both cases are reported the same way.
        if isinstance(test_score, (list, tuple)):
            shown_scores = test_score
        else:
            shown_scores = [test_score]
        # Prefix every value (the first one included) with ", score=".
        msg += "".join(", score=%f" % ts for ts in shown_scores)
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        LOG.info("[CV] %s %s", (64 - len(end_msg)) * ".", end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret
def _fit_and_score_clean_test(estimator, X, y, scorer, train, test, verbose,
                              parameters, fit_params,
                              return_train_score=False,
                              return_parameters=False,
                              return_n_test_samples=False,
                              return_times=False, error_score='raise'):
    """Fit and score one CV split, scoring the test fold on clean targets.

    An optional ``'y_clean'`` entry is removed from a copy of ``fit_params``.
    When it is present, the test-fold targets are taken from it instead of
    ``y``; the training fold always uses ``y``.

    ``scorer`` is either a callable (single metric) or a mapping with a
    ``keys()`` method (multi-metric); in the latter case scores are dicts.

    Returns
    -------
    list
        ``train_scores`` (optional), ``test_scores``, number of test samples
        (optional), ``fit_time`` and ``score_time`` (optional), ``parameters``
        (optional).

    Raises
    ------
    ValueError
        If the fit fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            rendered = ('%s=%s' % (k, v) for k, v in parameters.items())
            msg = '%s' % (', '.join(rendered))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Work on a private copy so that popping 'y_clean' never touches the
    # caller's dictionary.
    own_params = {} if fit_params is None else fit_params.copy()
    y_clean = own_params.pop('y_clean', None)
    # Adjust length of sample weights
    fit_params = {name: _index_param_value(X, value, train)
                  for name, value in own_params.items()}

    test_scores = {}
    train_scores = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    # The clean targets, when supplied, replace ``y`` for the test fold only.
    test_targets = y if y_clean is None else y_clean
    X_test, y_test = _safe_split(estimator, X, test_targets, test, train)

    is_multimetric = not callable(scorer)
    n_scorers = len(scorer.keys()) if is_multimetric else 1

    def _scores_after_failure():
        # One ``error_score`` per metric name, or the bare value.
        if is_multimetric:
            return dict(zip(scorer.keys(), [error_score, ] * n_scorers))
        return error_score

    try:
        if y_train is not None:
            estimator.fit(X_train, y_train, **fit_params)
        else:
            estimator.fit(X_train, **fit_params)
    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
        test_scores = _scores_after_failure()
        if return_train_score:
            train_scores = _scores_after_failure()
        warnings.warn(
            "Classifier fit failed. The score on this train-test"
            " partition for these parameters will be set to %f. "
            "Details: \n%r" % (error_score, e), FitFailedWarning)
    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer,
                             is_multimetric)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

    if verbose > 2:
        if not is_multimetric:
            msg += ", score=%s" % test_scores
        else:
            for scorer_name, score in test_scores.items():
                msg += ", %s=%s" % (scorer_name, score)
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = []
    if return_train_score:
        ret.append(train_scores)
    ret.append(test_scores)
    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret += [fit_time, score_time]
    if return_parameters:
        ret.append(parameters)
    return ret
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params, scorer_params, return_train_score=False,
                   return_parameters=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like or None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape = (n_train_samples,)
        Indices of training samples.

    test : array-like, shape = (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``. Values whose
        length equals the number of samples are restricted to ``train``.

    scorer_params : dict or None
        Parameters that will be passed to the scorer. Values whose length
        equals the number of samples are restricted to ``train`` or ``test``
        to match the fold being scored.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.

    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is
        `True`.

    test_score : float
        Score on test set.

    n_test_samples : int
        Number of test samples.

    scoring_time : float
        Time spent for fitting and scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    if verbose > 1:
        if parameters is not None:
            msg = '%s' % (', '.join('%s=%s' % (name, value)
                                    for name, value in parameters.items()))
        else:
            msg = "no parameters to be set"
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    n_samples = _num_samples(X)

    def _restrict(params, indices):
        # Per-sample values (same length as X) are cut down to ``indices``;
        # everything else is passed through untouched.
        restricted = {}
        for name, value in params.items():
            if hasattr(value, '__len__') and len(value) == n_samples:
                value = np.asarray(value)[indices]
            restricted[name] = value
        return restricted

    # Adjust length of sample weights
    if fit_params is None:
        fit_params = {}
    fit_params = _restrict(fit_params, train)

    # Same, but take both slices
    if scorer_params is None:
        scorer_params = {}
    train_scorer_params = _restrict(scorer_params, train)
    test_scorer_params = _restrict(scorer_params, test)

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if y_train is not None:
        estimator.fit(X_train, y_train, **fit_params)
    else:
        estimator.fit(X_train, **fit_params)

    test_score = _score(estimator, X_test, y_test, scorer,
                        **test_scorer_params)
    if return_train_score:
        train_score = _score(estimator, X_train, y_train, scorer,
                             **train_scorer_params)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = []
    if return_train_score:
        ret.append(train_score)
    ret += [test_score, _num_samples(X_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
def fit_grid_point_extended(X, y, base_estimator, parameters, train, test,
                            scorer, verbose, loss_func=None, extraOut="auto",
                            **fit_params):
    """Run fit on one set of parameters.

    Parameters
    ----------
    X : array-like, sparse matrix or list
        Input data.

    y : array-like or None
        Targets for input data.

    base_estimator : estimator object
        This estimator will be cloned and then fitted.

    parameters : dict
        Parameters to be set on base_estimator clone for this grid point.

    train : ndarray, dtype int or bool
        Boolean mask or indices for training set.

    test : ndarray, dtype int or bool
        Boolean mask or indices for test set.

    scorer : callable or None.
        If provided must be a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    verbose : int
        Verbosity level.

    loss_func : object, optional
        Accepted for call compatibility; not used by this function.

    extraOut : None, "auto" or container of str
        Selects the extra outputs: ``"estimator"``, ``"predictions"`` and
        ``"importances"``; ``"auto"`` enables predictions and importances.

    **fit_params : kwargs
        Additional parameter passed to the fit function of the estimator.

    Returns
    -------
    list
        ``[score, parameters, n_samples_test, extras]`` where ``extras`` is a
        dict that may hold the fitted clone (``"estimator"``), the test-fold
        predictions keyed by sample index (``"predictions"``) and
        ``"importances"``.

    Raises
    ------
    ValueError
        For a callable kernel, a non-array or non-square precomputed kernel,
        or a score that is not a number.
    """
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                for k, v in parameters.items()))
        print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    clf = clone(base_estimator)
    clf.set_params(**parameters)

    if hasattr(base_estimator, 'kernel') and callable(base_estimator.kernel):
        # cannot compute the kernel values with custom function
        raise ValueError("Cannot use a custom kernel function. "
                         "Precompute the kernel matrix instead.")

    if not hasattr(X, "shape"):
        if getattr(base_estimator, "_pairwise", False):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        X_train = [X[idx] for idx in train]
        X_test = [X[idx] for idx in test]
    else:
        if getattr(base_estimator, "_pairwise", False):
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            X_train = X[np.ix_(train, train)]
            X_test = X[np.ix_(test, train)]
        else:
            X_train = X[safe_mask(X, train)]
            X_test = X[safe_mask(X, test)]

    if y is not None:
        y_test = y[safe_mask(y, test)]
        y_train = y[safe_mask(y, train)]
        clf.fit(X_train, y_train, **fit_params)

        if scorer is not None:
            this_score = scorer(clf, X_test, y_test)
        else:
            this_score = clf.score(X_test, y_test)
    else:
        clf.fit(X_train, **fit_params)
        if scorer is not None:
            this_score = scorer(clf, X_test)
        else:
            this_score = clf.score(X_test)

    if not isinstance(this_score, numbers.Number):
        raise ValueError("scoring must return a number, got %s (%s)"
                         " instead." % (str(this_score), type(this_score)))

    if verbose > 2:
        msg += ", score=%f" % this_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() -
                                                       start_time))
        print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    extraRVs = {}
    if extraOut is not None:
        if "estimator" in extraOut:
            extraRVs["estimator"] = clf
        if extraOut == "auto" or "predictions" in extraOut:
            # Predict on the test fold itself so that the i-th prediction
            # belongs to the i-th test sample (and so that precomputed
            # kernels get the test-vs-train block built above).
            predictions = clf.predict(X_test)
            test_indices = np.asarray(test)
            if test_indices.dtype == bool:
                # A boolean mask is turned into the positions it selects.
                test_indices = np.flatnonzero(test_indices)
            extraRVs["predictions"] = dict(zip(test_indices, predictions))
        if (extraOut == "auto" or "importances" in extraOut) and hasattr(
                clf, "feature_importances_"):
            extraRVs["importances"] = clf.feature_importances_
    rvs = [this_score, parameters, _num_samples(X_test), extraRVs]
    return rvs
def monkeypatch_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                              parameters, fit_params, return_train_score=False,
                              return_parameters=False,
                              return_n_test_samples=False,
                              return_times=False, return_estimator=False,
                              error_score='raise-deprecating'):
    """Fit and score one CV split, running pipeline transforms only once.

    When ``estimator`` is a ``Pipeline``, its last step is popped off
    ``pipe.steps`` (the pipeline object is modified in place), the remaining
    steps are fitted on the training fold and used to transform both folds,
    and the popped final estimator is fitted and scored on the transformed
    data. Fit parameters prefixed with the final step's name are routed to
    that estimator; the rest go to the remaining pipeline.

    Parameters
    ----------
    estimator : estimator or Pipeline
        Object to fit; configured with ``parameters`` first.
    X, y : data and targets
        Split through ``_safe_split`` using ``train`` and ``test``.
    scorer : callable or mapping
        A callable means single-metric scoring; otherwise ``scorer.keys()``
        names the metrics and scores are dicts.
    train, test : array-like
        Sample selections for the two folds.
    verbose : int
        ``> 1`` prints start/end lines, ``> 2`` adds the test scores.
    parameters : dict or None
        Estimator parameters to set before fitting.
    fit_params : dict or None
        Extra ``fit`` arguments, indexed to the training fold.
    return_train_score, return_n_test_samples, return_times,
    return_parameters : bool
        Select the optional entries of the result.
    return_estimator : bool
        Accepted for signature compatibility; not used by this function.
    error_score : 'raise', 'raise-deprecating' or number
        Both string values re-raise a fit error; a number is used in place
        of the scores and a ``FitFailedWarning`` is issued.

    Returns
    -------
    list
        ``train_scores`` (optional), ``test_scores``, number of test samples
        (optional), ``fit_time`` and ``score_time`` (optional), ``parameters``
        (optional).

    Raises
    ------
    ValueError
        If the fit fails and ``error_score`` is neither one of the accepted
        strings nor a number.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    test_scores = {}
    train_scores = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    is_multimetric = not callable(scorer)
    n_scorers = len(scorer.keys()) if is_multimetric else 1

    # ===================================================================
    # BEGIN MONKEYPATCH MODIFICATION
    # ===================================================================

    try:
        if isinstance(estimator, Pipeline):
            pipe = estimator
            est_name, estimator = pipe.steps.pop()
            fit_params_est = {}
            # Iterate over a snapshot of the keys: entries are popped from
            # ``fit_params`` inside the loop, which is an error on a live
            # key view.
            for pname in list(fit_params):
                step, param = pname.split('__', 1)
                if step == est_name:
                    fit_params_est[param] = fit_params.pop(pname)
        else:
            pipe = None

        if y_train is None:
            if pipe is not None:
                X_train = pipe.fit_transform(X_train, **fit_params)
                # NOTE(review): the remaining fit params are also forwarded
                # to ``transform`` - confirm the pipeline accepts them.
                X_test = pipe.transform(X_test, **fit_params)
                fit_params = fit_params_est
            estimator.fit(X_train, **fit_params)
        else:
            if pipe is not None:
                X_train = pipe.fit_transform(X_train, y_train, **fit_params)
                X_test = pipe.transform(X_test, **fit_params)
                fit_params = fit_params_est
            estimator.fit(X_train, y_train, **fit_params)

    # ===================================================================
    # END MONKEYPATCH MODIFICATION
    # ===================================================================

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        # The default 'raise-deprecating' must surface the original fit
        # error, not be reported as an invalid ``error_score``.
        if error_score in ('raise', 'raise-deprecating'):
            raise
        elif isinstance(error_score, numbers.Number):
            if is_multimetric:
                test_scores = dict(zip(scorer.keys(),
                                       [error_score, ] * n_scorers))
                if return_train_score:
                    train_scores = dict(zip(scorer.keys(),
                                            [error_score, ] * n_scorers))
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer,
                             is_multimetric)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

    if verbose > 2:
        if is_multimetric:
            for scorer_name, score in test_scores.items():
                msg += ", %s=%s" % (scorer_name, score)
        else:
            msg += ", score=%s" % test_scores
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret
def nested_fit_and_score(
        estimator, X, y, scorer, train, test, verbose=1,
        parameters=None, fit_params=None, return_train_score=False,
        return_times=False, error_score='raise'):
    """Fit ``estimator`` on one split and report its score and accuracy.

    Parameters
    ----------
    estimator : object
        Estimator that is configured with ``parameters`` and fitted in place.
    X, y : data and targets
        Split through ``_safe_split`` using ``train`` and ``test``.
    scorer : object
        Passed unchanged to ``_score`` for the main score.
    train, test : array-like
        Sample selections for the two folds.
    verbose : int
        ``> 0`` logs the iteration summary, ``> 1`` also logs fold sizes
        and label sums.
    parameters : dict or None
        Estimator parameters to set before fitting.
    fit_params : dict or None
        Extra ``fit`` arguments, indexed to the training fold.
    return_train_score : bool
        Add ``ret['train']`` with the training score.
    return_times : bool
        Add ``ret['times'] = [fit_time, score_time]``.
    error_score : 'raise' or number
        ``'raise'`` re-raises a fit error; a number replaces the score, the
        train score and the accuracy, and a warning is logged.

    Returns
    -------
    ret : dict
        ``{'test': {'score': ..., 'accuracy': ...}}`` plus the optional
        ``'train'`` and ``'times'`` entries. ``score`` is ``None`` when the
        test fold holds a single label value and scoring was skipped.
    estimator : object
        The estimator that was passed in, after fitting.

    Raises
    ------
    ValueError
        If the fit fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    from sklearn.externals.joblib.logger import short_format_time

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if verbose > 1:
        LOG.info('CV iteration: Xtrain=%d, Ytrain=%d/%d -- Xtest=%d, Ytest=%d/%d.',
                 len(X_train), len(X_train) - sum(y_train), sum(y_train),
                 len(X_test), len(X_test) - sum(y_test), sum(y_test))

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            # The accuracy cannot be measured on an estimator whose fit
            # failed, so it gets the same placeholder as the score.
            acc_score = error_score
            if return_train_score:
                train_score = error_score
            LOG.warning("Classifier fit failed. The score on this train-test"
                        " partition for these parameters will be set to %f. "
                        "Details: \n%r", error_score, e)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time

        test_score = None
        score_time = 0.0
        # Scoring is only attempted when the test fold has more than one
        # distinct label value.
        if len(set(y_test)) > 1:
            test_score = _score(estimator, X_test, y_test, scorer)
            score_time = time.time() - start_time - fit_time
        else:
            LOG.warning('Test set has no positive labels, scoring has been skipped '
                        'in this loop.')

        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

        acc_score = _score(estimator, X_test, y_test,
                           check_scoring(estimator, scoring='accuracy'))

    if verbose > 0:
        total_time = score_time + fit_time
        if test_score is not None:
            LOG.info('Iteration took %s, score=%f, accuracy=%f.',
                     short_format_time(total_time), test_score, acc_score)
        else:
            LOG.info('Iteration took %s, score=None, accuracy=%f.',
                     short_format_time(total_time), acc_score)

    ret = {
        'test': {'score': test_score, 'accuracy': acc_score}
    }

    if return_train_score:
        ret['train'] = {'score': train_score}

    if return_times:
        ret['times'] = [fit_time, score_time]

    return ret, estimator
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise', to_evaluate=None):
    """Fit estimator and compute scores for a given dataset split.

    Progress is logged unconditionally; ``verbose`` is accepted but not
    consulted by this variant.

    Parameters
    ----------
    estimator : object
        Estimator that is configured with ``parameters`` and fitted in place.
    X, y : data and targets
        Split through ``_safe_split`` using ``train`` and ``test``.
    scorer : iterable
        Iterated over; every element is passed to ``_score`` and used as a
        key of the test-score dict.
    train, test : array-like
        Sample selections for the two folds.
    verbose : int
        Unused (see above).
    parameters : dict or None
        Estimator parameters to set before fitting.
    fit_params : dict or None
        Extra ``fit`` arguments, indexed to the training fold.
    return_train_score, return_n_test_samples, return_times,
    return_parameters : bool
        Select the optional entries of the result.
    error_score : 'raise' or number
        ``'raise'`` re-raises a fit error; a number is used in place of the
        scores and a ``FitFailedWarning`` is issued.
    to_evaluate : mapping or None
        Extra datasets: each value provides ``"x"`` and ``"y"``. Every scorer
        is evaluated on every dataset and stored under
        ``<dataset name> + "_" + <scorer>``.

    Returns
    -------
    result of ``np.squeeze``
        Applied to the list ``train_score`` (optional, a list),
        ``test_score`` (a dict keyed by scorer), number of test samples
        (optional), ``fit_time`` and ``score_time`` (optional),
        ``parameters`` (optional), extra evaluation dict (only when
        ``to_evaluate`` is truthy).

    Raises
    ------
    ValueError
        If the fit fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    if parameters is None:
        msg = ''
    else:
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                for k, v in parameters.items()))
    LOG.info("[CV] %s %s", msg, (64 - len(msg)) * '.')

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to {}. "
                "Details: \n{} \n model: {}".format(error_score, e, estimator),
                FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        test_score = {s: _score(estimator, X_test, y_test, s) for s in scorer}
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = [
                _score(estimator, X_train, y_train, s) for s in scorer
            ]

    # The placeholder is required for the score to appear in the message.
    msg += ", score={}".format(test_score)
    total_time = score_time + fit_time
    end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
    LOG.info("[CV] %s %s", (64 - len(end_msg)) * '.', end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if to_evaluate:
        res_evaluation = dict()
        for ds_name, ds_vals in to_evaluate.items():
            for s in scorer:
                res_evaluation[ds_name + "_" + s] = _score(
                    estimator, ds_vals["x"], ds_vals["y"], s)

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    if to_evaluate:
        ret.append(res_evaluation)
    return np.squeeze(ret)
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise'):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : object
        Estimator that is configured with ``parameters`` and fitted in place.
    X, y : data and targets
        Split through ``_safe_split`` using ``train`` and ``test``.
    scorer : iterable
        Iterated over; every element is passed to ``_score``.
    train, test : array-like
        Sample selections for the two folds.
    verbose : int
        ``> 1`` logs start/end lines, ``> 2`` adds the test scores.
    parameters : dict or None
        Estimator parameters to set before fitting.
    fit_params : dict or None
        Extra ``fit`` arguments, indexed to the training fold.
    return_train_score, return_n_test_samples, return_times,
    return_parameters : bool
        Select the optional entries of the result.
    error_score : 'raise' or number
        ``'raise'`` re-raises a fit error; a number is used in place of the
        scores and a ``FitFailedWarning`` is issued.

    Returns
    -------
    list
        ``[train_score]`` (optional), ``test_score``, number of test samples
        (optional), ``fit_time`` and ``score_time`` (optional), ``parameters``
        (optional). Scores are lists with one value per scorer, or the bare
        ``error_score`` when the fit failed.

    Raises
    ------
    ValueError
        If the fit fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        LOG.info("[CV] %s %s", msg, (64 - len(msg)) * '.')

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        test_score = [_score(estimator, X_test, y_test, s) for s in scorer]
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = [_score(estimator, X_train, y_train, s)
                           for s in scorer]

    if verbose > 2:
        # After a failed fit ``test_score`` is the bare ``error_score``;
        # wrap it so both cases are reported the same way.
        if isinstance(test_score, (list, tuple)):
            shown_scores = test_score
        else:
            shown_scores = [test_score]
        # Prefix every value (the first one included) with ", score=".
        msg += "".join(", score=%f" % ts for ts in shown_scores)
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        LOG.info("[CV] %s %s", (64 - len(end_msg)) * '.', end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        a FitFailedWarning is issued instead.

    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is
        `True`.

    test_score : float
        Score on test set.

    n_test_samples : int
        Number of test samples.

    scoring_time : float
        Time spent for fitting and scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    if verbose > 1:
        if parameters is not None:
            msg = '%s' % (', '.join('%s=%s' % (name, value)
                                    for name, value in parameters.items()))
        else:
            msg = "no parameters to be set"
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    if fit_params is None:
        fit_params = {}
    fit_params = {name: _index_param_value(X, value, train)
                  for name, value in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is not None:
            estimator.fit(X_train, y_train, **fit_params)
        else:
            estimator.fit(X_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
        # A numeric error_score stands in for every score of this split.
        test_score = error_score
        if return_train_score:
            train_score = error_score
        warnings.warn(
            "Classifier fit failed. The score on this train-test"
            " partition for these parameters will be set to %f. "
            "Details: \n%r" % (error_score, e), FitFailedWarning)
    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = []
    if return_train_score:
        ret.append(train_score)
    ret += [test_score, _num_samples(X_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
def _fit_and_score_keras2(method, X, y, scorer, train, test, verbose,
                          parameters, fit_params, type="Classification",
                          return_train_score=False, return_parameters=False,
                          return_n_test_samples=False, return_times=False,
                          error_score='raise'):
    """Fit estimator and compute scores for a given dataset split for
    KerasClassifier and KerasRegressor.

    A TensorFlow session with ``allow_growth`` enabled is installed as the
    Keras session before the estimator is built. ``K.clear_session()`` and
    repeated garbage collection run when the function exits, whether it
    returns or raises, so a failed split does not skip the cleanup.

    Parameters
    ----------
    method : callable
        Passed as ``build_fn`` to the Keras scikit-learn wrapper.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.

        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.

        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    type : str, default: "Classification"
        ``"Classification"`` builds a ``KerasClassifier``; any other value
        builds a ``KerasRegressor``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.

    return_n_test_samples : boolean, optional, default: False
        Whether to return the ``n_test_samples``

    return_times : boolean, optional, default: False
        Whether to return the fit/score times.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        a FitFailedWarning is issued instead.

    Returns
    -------
    train_scores : dict of scorer name -> float, optional
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.

    test_scores : dict of scorer name -> float, optional
        Score on testing set (for all the scorers).

    n_test_samples : int
        Number of test samples.

    fit_time : float
        Time spent for fitting in seconds.

    score_time : float
        Time spent for scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    from keras import backend as K
    import tensorflow as tf
    # This is useful to avoid the info log of tensorflow
    tf.logging.set_verbosity(tf.logging.ERROR)
    # The next 4 lines are for avoiding tensorflow to allocate all the GPU
    # memory
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    # Bound before the ``try`` so that the cleanup can always delete it.
    estimator = None
    try:
        if verbose > 1:
            if parameters is None:
                msg = ''
            else:
                msg = '%s' % (', '.join('%s=%s' % (k, v)
                                        for k, v in parameters.items()))
            print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

        # Adjust length of sample weights
        fit_params = fit_params if fit_params is not None else {}
        fit_params = dict([(k, _index_param_value(X, v, train))
                           for k, v in fit_params.items()])

        test_scores = {}
        train_scores = {}
        if type == "Classification":
            from keras.wrappers.scikit_learn import KerasClassifier
            estimator = KerasClassifier(build_fn=method, verbose=0)
        else:
            from keras.wrappers.scikit_learn import KerasRegressor
            estimator = KerasRegressor(build_fn=method, verbose=0)
        if parameters is not None:
            estimator.set_params(**parameters)

        start_time = time.time()

        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)

        is_multimetric = not callable(scorer)
        n_scorers = len(scorer.keys()) if is_multimetric else 1

        try:
            if y_train is None:
                estimator.fit(X_train, **fit_params)
            else:
                estimator.fit(X_train, y_train, **fit_params)

        except Exception as e:
            # Note fit time as time until error
            fit_time = time.time() - start_time
            score_time = 0.0
            if error_score == 'raise':
                raise
            elif isinstance(error_score, numbers.Number):
                if is_multimetric:
                    test_scores = dict(
                        zip(scorer.keys(), [
                            error_score,
                        ] * n_scorers))
                    if return_train_score:
                        train_scores = dict(
                            zip(scorer.keys(), [
                                error_score,
                            ] * n_scorers))
                else:
                    test_scores = error_score
                    if return_train_score:
                        train_scores = error_score
                warnings.warn(
                    "Classifier fit failed. The score on this train-test"
                    " partition for these parameters will be set to %f. "
                    "Details: \n%r" % (error_score, e), FitFailedWarning)
            else:
                raise ValueError(
                    "error_score must be the string 'raise' or a"
                    " numeric value. (Hint: if using 'raise', please"
                    " make sure that it has been spelled correctly.)")

        else:
            fit_time = time.time() - start_time
            # _score will return dict if is_multimetric is True
            test_scores = _score(estimator, X_test, y_test, scorer,
                                 is_multimetric)
            score_time = time.time() - start_time - fit_time
            if return_train_score:
                train_scores = _score(estimator, X_train, y_train, scorer,
                                      is_multimetric)

        if verbose > 2:
            if is_multimetric:
                for scorer_name, score in test_scores.items():
                    msg += ", %s=%s" % (scorer_name, score)
            else:
                msg += ", score=%s" % test_scores
        if verbose > 1:
            total_time = score_time + fit_time
            end_msg = "%s, total=%s" % (msg,
                                        logger.short_format_time(total_time))
            print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

        ret = ([train_scores, test_scores]
               if return_train_score else [test_scores])

        if return_n_test_samples:
            ret.append(_num_samples(X_test))
        if return_times:
            ret.extend([fit_time, score_time])
        if return_parameters:
            ret.append(parameters)
        return ret
    finally:
        # The estimator is erased
        del estimator
        # Clean the session
        K.clear_session()
        # The garbage collector is called in order to ensure that the
        # estimator is erased from memory
        for _ in range(15):
            gc.collect()
def _extended_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                            parameters, fit_params, return_train_score=False,
                            return_parameters=False, error_score='raise',
                            extraOut="auto"):
    """Fit ``estimator`` on one train/test split, score it, and collect extras.

    Parameters
    ----------
    estimator : object with ``fit`` and ``set_params``
        The estimator to fit; it is modified in place.
    X, y : data and (optional) targets
        Split with ``_safe_split`` using ``train`` and ``test``.
    scorer : object
        Forwarded unchanged to ``_score``.
    train, test : index arrays
        Indices of the train and test samples. They must expose ``.shape``
        when ``extraOut`` is not None.
    verbose : int
        ``> 1`` prints start/end progress lines, ``> 2`` also prints the
        test score.
    parameters : dict or None
        Passed to ``estimator.set_params`` when not None.
    fit_params : dict or None
        Extra keyword arguments for ``estimator.fit``; every value is passed
        through ``_index_param_value(X, value, train)`` first.
    return_train_score : bool
        Also score the training split and put that score first in the result.
    return_parameters : bool
        Append ``parameters`` to the result.
    error_score : 'raise' or number
        ``'raise'`` re-raises a fit failure. A number is used as the score(s)
        and a ``FitFailedWarning`` is issued. Anything else raises
        ``ValueError`` when the fit fails.
    extraOut : None, "auto", or a container/string of output names
        Selects the entries of the trailing dict (see Returns).

    Returns
    -------
    list
        ``[train_score (optional), test_score, n_test_samples, scoring_time,
        parameters (optional), extras]``. ``extras`` is an empty dict when
        ``extraOut`` is None. Otherwise it holds ``"counts"`` and, depending
        on ``extraOut``, ``"estimator"``, ``"probabilities"`` (test index ->
        ``predict_proba`` row) and ``"importances"``.

    Notes
    -----
    When the fit failed and a numeric ``error_score`` was substituted, the
    fit-dependent extras (``"probabilities"``, ``"importances"``) are omitted.
    Calling ``predict_proba`` on an estimator whose fit just failed would
    either raise or return predictions from an earlier fit.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    fit_failed = False
    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            fit_failed = True
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )

    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)

    # Add additional return values
    extraRVs = {}
    if extraOut is not None:
        extraRVs["counts"] = {"train": train.shape[0], "test": test.shape[0]}
        if "estimator" in extraOut:
            extraRVs["estimator"] = estimator
        # Everything below needs a successfully fitted estimator.
        if not fit_failed:
            if extraOut == "auto" or "predictions" in extraOut:
                assert test.shape[0] == X_test.shape[0]
                probabilities = estimator.predict_proba(X_test)
                # Map each test-sample index to its predicted probabilities.
                extraRVs["probabilities"] = dict(zip(test, probabilities))
            if ((extraOut == "auto" or "importances" in extraOut)
                    and hasattr(estimator, "feature_importances_")):
                extraRVs["importances"] = estimator.feature_importances_
    ret.append(extraRVs)
    return ret
def nested_fit_and_score(estimator, X, y, scorer, train, test, verbose=1,
                         parameters=None, fit_params=None,
                         return_train_score=False, return_times=False,
                         error_score='raise'):
    """Fit ``estimator`` on one split and report its score and accuracy.

    Parameters
    ----------
    estimator : object with ``fit`` and ``set_params``
        The estimator to fit; it is modified in place and also returned.
    X, y : data and targets
        Split with ``_safe_split`` using ``train`` and ``test``.
    scorer : object
        Forwarded unchanged to ``_score`` for the main score.
    train, test : index arrays
        Indices of the train and test samples.
    verbose : int
        ``> 0`` logs the per-iteration summary, ``> 1`` also logs the
        split sizes and label sums.
    parameters : dict or None
        Passed to ``estimator.set_params`` when not None.
    fit_params : dict or None
        Extra keyword arguments for ``estimator.fit``; every value is passed
        through ``_index_param_value(X, value, train)`` first.
    return_train_score : bool
        Also score the training split.
    return_times : bool
        Also return ``[fit_time, score_time]``.
    error_score : 'raise' or number
        ``'raise'`` re-raises a fit failure. A number is used for the test
        score, the train score and the accuracy, and a warning is logged.
        Anything else raises ``ValueError`` when the fit fails.

    Returns
    -------
    ret : dict
        ``{'test': {'score': ..., 'accuracy': ...}}`` plus ``'train'`` and
        ``'times'`` entries when requested. ``'score'`` is None when the test
        labels contain a single distinct value and scoring was skipped.
    estimator : object
        The same estimator object that was passed in.
    """
    from sklearn.externals.joblib.logger import short_format_time

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if verbose > 1:
        LOG.info(
            'CV iteration: Xtrain=%d, Ytrain=%d/%d -- Xtest=%d, Ytest=%d/%d.',
            len(X_train), len(X_train) - sum(y_train), sum(y_train),
            len(X_test), len(X_test) - sum(y_test), sum(y_test))

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            # The accuracy is reported unconditionally further down, so it
            # needs a value on this path too (it used to be left undefined,
            # which turned a tolerated fit failure into a NameError).
            acc_score = error_score
            if return_train_score:
                train_score = error_score
            LOG.warning(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r", error_score, e)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time

        test_score = None
        score_time = 0.0
        # Scoring is skipped when the test labels hold one distinct value.
        if len(set(y_test)) > 1:
            test_score = _score(estimator, X_test, y_test, scorer)
            score_time = time.time() - start_time - fit_time
        else:
            LOG.warning(
                'Test set has no positive labels, scoring has been skipped '
                'in this loop.')

        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

        acc_score = _score(estimator, X_test, y_test,
                           check_scoring(estimator, scoring='accuracy'))

    if verbose > 0:
        total_time = score_time + fit_time
        if test_score is not None:
            LOG.info('Iteration took %s, score=%f, accuracy=%f.',
                     short_format_time(total_time), test_score, acc_score)
        else:
            LOG.info('Iteration took %s, score=None, accuracy=%f.',
                     short_format_time(total_time), acc_score)

    ret = {'test': {'score': test_score, 'accuracy': acc_score}}

    if return_train_score:
        ret['train'] = {'score': train_score}

    if return_times:
        ret['times'] = [fit_time, score_time]

    return ret, estimator
def _fit_and_score(estimator, Z, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """Fit ``estimator`` on ``Z[train]`` and score it on ``Z[test]``.

    Returns a list ``[train_score (optional), test_score, n_test_samples,
    scoring_time, parameters (optional)]``. A fit failure is re-raised when
    ``error_score`` is ``'raise'``, replaced by ``error_score`` (with a
    ``FitFailedWarning``) when it is a number, and reported as a
    ``ValueError`` for any other ``error_score`` value.
    """
    if verbose > 1:
        if parameters is not None:
            described = ['%s=%s' % (name, value)
                         for name, value in parameters.items()]
            msg = ', '.join(described)
        else:
            msg = "no parameters to be set"
        print("[CV] %s %s" % (msg, '.' * (64 - len(msg))))

    if fit_params is None:
        fit_params = {}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    Z_train = Z[train]
    Z_test = Z[test]

    try:
        estimator.fit(Z_train, **fit_params)
    except Exception as exc:
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
        # Numeric fallback: the substitute stands in for every score.
        test_score = error_score
        if return_train_score:
            train_score = error_score
        warnings.warn(
            "Classifier fit failed. The score on this train-test"
            " partition for these parameters will be set to %f. "
            "Details: \n%r" % (error_score, exc), FitFailedWarning)
    else:
        test_score = _score(estimator, Z_test, scorer)
        if return_train_score:
            train_score = _score(estimator, Z_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ('.' * (64 - len(end_msg)), end_msg))

    leading = [train_score] if return_train_score else []
    ret = leading + [test_score, _num_samples(Z_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
def _fit_and_score_multisignal(estimator, X, y, scorer, train, test, verbose,
                               parameters, fit_params,
                               return_train_score=False,
                               return_parameters=False,
                               return_n_test_samples=False,
                               return_times=False, error_score='raise',
                               logger=logger):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.

        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.

        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        a warning describing the failure is emitted through ``logger``.
        Any other value raises ``ValueError`` when the fit fails.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that have been used for the estimator.

    return_n_test_samples : boolean, optional, default: False
        Whether to return the ``n_test_samples``

    return_times : boolean, optional, default: False
        Whether to return the fit/score times.

    logger : object with ``info`` and ``warning`` methods
        Receives the progress lines and the fit-failure warning.

    Returns
    -------
    train_scores : dict of scorer name -> float, optional
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.

    test_scores : dict of scorer name -> float, optional
        Score on testing set (for all the scorers).

    n_test_samples : int
        Number of test samples.

    fit_time : float
        Time spent for fitting in seconds.

    score_time : float
        Time spent for scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        logger.info("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    test_scores = {}
    train_scores = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split_multisignal(estimator, X, y, train)
    X_test, y_test = _safe_split_multisignal(estimator, X, y, test, train)

    # A non-callable scorer is treated as a mapping of name -> scorer.
    is_multimetric = not callable(scorer)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if is_multimetric:
                test_scores = dict.fromkeys(scorer.keys(), error_score)
                if return_train_score:
                    train_scores = dict.fromkeys(scorer.keys(), error_score)
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            # Pass one fully formatted message. A logger interprets extra
            # positional arguments as %-format arguments, so handing it a
            # warning category there breaks formatting and loses the text.
            logger.warning(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e))
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

    if verbose > 2:
        if is_multimetric:
            for scorer_name, score in test_scores.items():
                msg += ", %s=%s" % (scorer_name, score)
        else:
            msg += ", score=%s" % test_scores
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, short_format_time(total_time))
        logger.info("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret
def _fit_and_score(estimator, frame, feature_names, target_feature,
                   scorer, parameters, verbose, scoring_params,
                   train, test, is_regression, act_args,
                   cv_fold, iteration):
    """Fit one cross-validation fold for one parameter setting and score it.

    Parameters
    ----------
    estimator : H2OPipeline or H2OEstimator
        The estimator to fit. It is modified in place.

    frame : H2OFrame, shape=(n_samples, n_features)
        The frame that is sliced into the train and test folds.

    feature_names : iterable (str)
        Feature names handed to ``train`` for a bare ``H2OEstimator``.

    target_feature : str
        Name of the target column.

    scorer : H2OScorer
        Forwarded to ``_score``.

    parameters : dict or None
        Parameters to apply to the estimator; None is treated as ``{}``.

    verbose : int
        ``> 1`` prints progress lines, ``> 2`` also prints the score.

    scoring_params : dict
        Keyword arguments for ``_score`` when ``act_args`` is None.

    train : iterable, shape=(n_train_samples,)
        Train fold indices (sorted before use).

    test : iterable, shape=(n_test_samples,)
        Test fold indices (sorted before use).

    is_regression : bool
        Forwarded to ``_score``.

    act_args : dict or None
        When given, its ``'expo'``, ``'loss'`` and ``'prem'`` entries are
        indexed by the test fold and used as the scoring keyword arguments
        instead of ``scoring_params``.

    cv_fold : int
        Fold number, used for reporting only.

    iteration : int
        Iteration number, used for reporting only.

    Returns
    -------
    out : list, shape=(4,)
        ``[test_score, n_test_samples, estimator, parameters]``.

    Raises
    ------
    ValueError
        If ``estimator`` is a bare ``H2OEstimator`` and a parameter name
        contains ``'__'``.
    """
    parameters = {} if parameters is None else parameters

    if verbose > 1:
        if parameters:
            settings = ', '.join(
                '%s=%s' % (k, v) for k, v in parameters.items())
            msg = 'Target: %s; %s' % (target_feature, settings)
        else:
            msg = ''
        print("[CV (iter %i, fold %i)] %s %s"
              % (iteration, cv_fold, msg, '.' * (64 - len(msg))))

    # h2o does not re-order rows, so the fold indices are sorted first.
    train_idx = sorted(train)
    test_idx = sorted(test)

    if act_args is None:
        score_kwargs = scoring_params
    else:
        # Gains search: slice the existing arrays down to the test fold.
        prem = act_args['prem']
        score_kwargs = {
            'expo': act_args['expo'][test_idx],
            'loss': act_args['loss'][test_idx],
            'prem': prem[test_idx] if prem is not None else None
        }

    # generate split
    train_frame = frame[train_idx, :]
    test_frame = frame[test_idx, :]

    start_time = time.time()

    if isinstance(estimator, H2OEstimator):
        # A bare estimator takes flat parameter names only.
        for name, value in six.iteritems(parameters):
            if '__' in name:
                raise ValueError('only one estimator passed to grid search, '
                                 'but multiple named parameters passed: %s'
                                 % name)
            estimator._parms[name] = value
        estimator.train(training_frame=train_frame,
                        x=feature_names,
                        y=target_feature)
    else:
        # Anything else is handled through set_params / fit.
        estimator.set_params(**parameters)
        estimator.fit(train_frame)

    test_score = _score(estimator, test_frame, target_feature,
                        scorer, is_regression, **score_kwargs)

    # h2o is verbose; when we are as well, separate its output from ours.
    if verbose > 1:
        print()

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ', score=%f' % test_score
    if verbose > 1:
        summary = '%s -%s' % (msg, logger.short_format_time(scoring_time))
        print('[CV (iter %i, fold %i)] %s %s'
              % (iteration, cv_fold, '.' * (64 - len(summary)), summary))
        print()
        print()

    return [test_score, len(test_idx), estimator, parameters]