def get_grid_scores(scores, parameters, n_samples, n_folds, iid):
    score_params_len = list(zip(scores, parameters, n_samples))
    n_fits = len(score_params_len)
    scores = []
    grid_scores = []
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, parameters, this_n_test_samples in \
                score_params_len[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))
    return grid_scores
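Every snippet in this section builds `_CVScoreTuple` records. As a point of reference, and only as an assumption based on the pre-0.18 scikit-learn `sklearn.grid_search` module where this helper lived, it behaves like a named tuple with the three fields these snippets read back:

# Minimal sketch (assumption): a stand-in for sklearn.grid_search._CVScoreTuple
# exposing the three fields used throughout this section.
from collections import namedtuple

_CVScoreTuple = namedtuple('_CVScoreTuple',
                           ['parameters',             # dict of hyperparameters
                            'mean_validation_score',  # fold-aggregated score
                            'cv_validation_scores'])  # per-fold scores (array)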
def fit_func(**params):
    params = apply_transforms(params, transforms)
    base_id = len(cv_scores) * len(cv)
    scores = PSOSearch.cross_val_score(base_index=base_id,
                                       estimator=est,
                                       parameters=params,
                                       loader=loader,
                                       cv=cv,
                                       scorer=scorer,
                                       fit_callback=fit_callback,
                                       cacher=cacher,
                                       callback=callback,
                                       mapper=mapper)
    cv_score = _CVScoreTuple(params, np.mean(scores), scores)
    cv_scores[base_id] = cv_score
    best_score_params = cv_scores.values()[np.argmax(
        np.array(map(lambda score: score.mean_validation_score,
                     cv_scores.itervalues())))]
    best_score_mean = best_score_params.mean_validation_score
    best_score_std = np.std(best_score_params.cv_validation_scores)
    if callback:
        callback(description='%.3f+-%.3f' % (best_score_mean, best_score_std))
    return scores.mean()
def pick_best_parameters(score_len_params, n_folds, iid):
    n_fits = len(score_len_params)
    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, parameters in \
                score_len_params[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))
    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    return best
def func(x):
    parameters = self._list_to_grid_point(x, parameter_iterable)
    n_test_samples = 0
    score = 0
    all_scores = []
    for train, test in cv:
        this_score, this_n_test_samples, _, parameters = \
            _fit_and_score(clone(base_estimator), X, y, self.scorer_,
                           train, test, self.verbose, parameters,
                           self.fit_params, return_parameters=True,
                           error_score=self.error_score)
        all_scores.append(this_score)
        if self.iid:
            this_score *= this_n_test_samples
            n_test_samples += this_n_test_samples
        score += this_score
    if self.iid:
        score /= float(n_test_samples)
    else:
        score /= float(n_folds)
    grid_scores.append(
        _CVScoreTuple(parameters, score, np.array(all_scores)))
    # print 'In func:', x, score
    return score
def pick_best_parameters(score_len_params, n_folds, iid):
    n_fits = len(score_len_params)
    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, parameters in \
                score_len_params[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        grid_scores.append(_CVScoreTuple(
            parameters, score, np.array(all_scores)))
    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    return best
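A quick self-contained check of the fold-blocked aggregation in `pick_best_parameters` above, using hypothetical toy numbers (and the `_CVScoreTuple` sketch earlier in this section):

# Toy example (hypothetical numbers): two candidates, two folds each.
# Each entry is (fold_score, n_test_samples, parameters).
toy = [
    (0.80, 50, {'C': 1.0}),
    (0.90, 100, {'C': 1.0}),
    (0.85, 50, {'C': 10.0}),
    (0.85, 100, {'C': 10.0}),
]
best = pick_best_parameters(toy, n_folds=2, iid=True)
# With iid=True each fold is weighted by its test-set size:
#   C=1.0  -> (0.80 * 50 + 0.90 * 100) / 150 = 0.8667
#   C=10.0 -> (0.85 * 50 + 0.85 * 100) / 150 = 0.8500
print(best.parameters)  # {'C': 1.0}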
def grid_scores_(self):
    grid_scores = list()
    scores_per_config = defaultdict(list)
    config_list = list()
    for run_key in self.runhistory_.data:
        run_value = self.runhistory_.data[run_key]
        config_id = run_key.config_id
        cost = run_value.cost
        if config_id not in config_list:
            config_list.append(config_id)
        scores_per_config[config_id].append(cost)
    for config_id in config_list:
        scores = [1 - score for score in scores_per_config[config_id]]
        mean_score = np.mean(scores)
        config = self.runhistory_.ids_config[config_id]
        grid_score = _CVScoreTuple(config.get_dictionary(), mean_score, scores)
        grid_scores.append(grid_score)
    return grid_scores
def fit_ipp(self, X, y, grid):
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
    if self.grid_parallel:
        scores, grid_scores = grid_cv_scores(self.estimator, X, y, grid,
                                             self.scoring, self.cv,
                                             self.profile, self.n_jobs,
                                             self.verbose)
    else:
        scores = []
        # grid = grid_search.ParameterGrid(self.param_grid)
        grid_scores = []
        for parameters in grid:
            self.estimator.set_params(**parameters)
            scores_cv = cross_val_score(self.estimator, X, y, self.scoring,
                                        self.cv, profile=self.profile)
            scores.append(np.array(scores_cv).mean())
            grid_scores.append(grid_search._CVScoreTuple(
                parameters, scores_cv.mean(), scores_cv))
    max_idx = np.array(scores).argmax()
    self.best_estimator_ = self.estimator.set_params(**list(grid)[max_idx])
    self.best_params_ = list(grid)[max_idx]
    self.scores_ = scores
    self.best_score_ = np.array(scores).max()
    self.grid_scores_ = grid_scores
    if self.refit:
        self.best_estimator_.fit(X, y)
    return self
def grid_cv_scores(estimator, X, y, grid, scoring=None, cv=10, profile='net',
                   n_jobs=-1, verbose=None):
    input_sets = []
    if type(cv) == int:
        cv = cross_validation.KFold(len(X), n_folds=cv)
    for parameters in grid:
        estimator1 = clone(estimator)
        input_sets.append({
            'estimator': estimator1.set_params(**parameters),
            'X': X,
            'y': y,
            'scoring': scoring,
            'cv': cv
        })
    dview = random_rc(profile=profile, n_jobs=n_jobs,
                      n_executed_jobs=len(input_sets))
    results = dview.map_sync(scores_out, input_sets)
    dview.client.close()
    scores = []
    grid_scores = []
    for ii in range(len(grid)):
        scores.append(results[ii].mean())
        grid_scores.append(grid_search._CVScoreTuple(
            list(grid)[ii], results[ii].mean(), results[ii]))
    return scores, grid_scores
def _fit(self, rdd, labeler, parameter_iterable):
    if self.n_duplicates == 1:
        self.partitioner = FlySplit(self.n_splits)
    else:
        self.partitioner = FlyDuplicate(self.n_duplicates)
    rdd = self.partitioner.prepare_rdd(rdd)

    base_estimator = clone(self.estimator)
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    rdd_partition_num = rdd.get_partitions()
    all_params = list(parameter_iterable)
    parameters = even_split(all_params, rdd_partition_num)
    out = rdd.mapPartitionsWithIndex(
        self._fit_partitions(labeler, parameters)).collect()

    # Out is a list of triplet: score, parameters, n_test_samples
    out = filter(None, out)
    out.sort(key=lambda x: all_params.index(x[1]))

    n_fits = len(out)
    n_folds = self.cv or 3

    scores = list()
    grid_scores = list()
    for grid_start in xrange(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, parameters, this_n_test_samples in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    best_estimator = clone(base_estimator).set_params(**best.parameters)
    self.best_estimator_ = best_estimator
    return self
def cached_results_to_cvscores(cache_results):
    results_by_params = defaultdict(list)
    for score, test_set_size, time, params in cache_results:
        results_by_params[frozenset(map(map_param,
                                        params.iteritems()))].append(score)
    grid_results = []
    for set_params, all_scores in results_by_params.iteritems():
        grid_results.append(
            _CVScoreTuple(dict(set_params), np.mean(all_scores),
                          np.array(all_scores)))
    return grid_results
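`map_param` is an external helper that is not part of this snippet. Purely for illustration, a hypothetical stand-in might normalize each `(name, value)` item into something hashable so the `frozenset` grouping key works:

# Hypothetical stand-in for the external map_param helper (illustration only):
# make one (name, value) item hashable so it can live inside a frozenset key.
def map_param(item):
    name, value = item
    if isinstance(value, list):
        value = tuple(value)  # lists are unhashable; tuples are
    return (name, value)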
def __call__(self, params):
    p = dict()
    invalid_parameters = False
    failed_power = 3.0
    for i, name in enumerate(self.parameterNames):
        p[name] = params[i]
        if self.parameters_restrictions and name in self.parameters_restrictions:
            if not self.parameters_restrictions[name](params[i]):
                invalid_parameters = True
                failed_power *= params[i]
    p.update(self.defaultParameters)
    print p
    if invalid_parameters:
        print 'invalid: ', -abs(failed_power)
        return -abs(failed_power)
    clf = self.classifierFactory(**p)
    scores = cross_validation.cross_val_score(clf, self.trainData,
                                              self.trainLabel,
                                              cv=self.cv, scoring='f1')
    print scores.mean()
    # todo: think about putting all scores to array/list
    self.grid_scores_.append(_CVScoreTuple(p, scores.mean(), scores))
    if self.best_score_ is None or self.best_score_ < scores.mean():
        self.best_score_ = scores.mean()
        self.best_params_ = p
    return scores.mean()
def fit(self, X, y=None, x_is_index=False):
    """
    fit creates a task for every pair of folds and combination of
    hyperparameters in the grid; it then distributes the tasks to the
    ipyparallel view and waits for completion.
    :param X: ndarray of data
    :param y: ndarray of target variables
    :param x_is_index: boolean variable to indicate that X is not the data
        itself, but the index of the data to be used on remote machines.
        Useful when sending the data by network is unfeasible
    """
    if not self.loader:
        self.loader = lambda: (X, y)
    parameter_iterable = ParameterGrid(self.param_grid)

    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)

    if x_is_index and self.loader is None:
        raise ValueError('no loader given')

    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    train_test_parameters = ((train, test,
                              apply_transforms(parameters, self.transforms))
                             for parameters in parameter_iterable
                             for train, test in cv)

    length = len(parameter_iterable) * len(cv)

    if self.callback:
        self.callback(len(self.cacher), length)

    if x_is_index:
        X_to_pass = X
        y_to_pass = y if self.loader is None else None
    else:
        if self.loader is not None:
            X_to_pass = None
            y_to_pass = None
        else:
            X_to_pass = X
            y_to_pass = y

    # print('sequences')
    # sequences = [
    #     train_test_parameters,
    #     [clone(base_estimator)] * length,
    #     [X_to_pass] * length,
    #     [y_to_pass] * length,
    #     [self.verbose] * length,
    #     [self.fit_params] * length,
    #     [True] * length,
    #     [self.scorer_] * length,
    #     [x_is_index] * length,
    # ]

    f = partial(GridSearchCVParallel.my_fit_and_score,
                estimator=base_estimator,
                X=X_to_pass,
                y=y_to_pass,
                fit_params=self.fit_params,
                scorer=self.scorer_,
                x_is_index=x_is_index,
                loader=self.loader,
                fit_callback=self.fit_callback)

    iterable = itertools.ifilter(lambda (i, ttp): i not in self.cacher,
                                 enumerate(train_test_parameters))

    results_by_params = defaultdict(list)
    for id, result in self.cacher.iteritems():
        score, test_size, time, params = result
        results_by_params[frozenset(map(map_param,
                                        params.iteritems()))].append(score)

    try:
        for index, result in self.mapper(f, iterable):
            self.cacher[index] = result
            score, test_size, time, params = result
            results_by_params[frozenset(map(
                map_param, params.iteritems()))].append(score)
            if self.callback:
                best_scores = next(
                    iter(sorted(itertools.ifilter(
                        lambda scores: len(scores) == len(cv),
                        results_by_params.values()),
                        key=lambda scores: np.mean(scores),
                        reverse=True)),
                    [0])
                self.callback(1, length,
                              description='%.3f+-%.3f' % (np.mean(best_scores),
                                                          np.std(best_scores)))
    except Exception as e:
        print(e)
        e_type, e_value, e_tb = sys.exc_info()
        traceback.print_tb(e_tb)

    # assert len(self.cacher) == length and \
    #     (np.array(self.cacher.keys()) == np.arange(length)).all()
    # out = self.cacher.values()
    #
    # # Out is a list of triplet: score, estimator, n_test_samples
    # n_fits = len(out)
    # n_folds = len(cv)
    #
    # scores = list()
    # grid_scores = list()
    # for grid_start in range(0, n_fits, n_folds):
    #     n_test_samples = 0
    #     score = 0
    #     all_scores = []
    #     for this_score, this_n_test_samples, _, parameters in \
    #             out[grid_start:grid_start + n_folds]:
    #         all_scores.append(this_score)
    #         if self.iid:
    #             this_score *= this_n_test_samples
    #             n_test_samples += this_n_test_samples
    #         score += this_score
    #     if self.iid:
    #         score /= float(n_test_samples)
    #     else:
    #         score /= float(n_folds)
    #     scores.append((score, parameters))
    #     # TODO: shall we also store the test_fold_sizes?
    #     grid_scores.append(_CVScoreTuple(
    #         parameters,
    #         score,
    #         np.array(all_scores)))

    grid_scores = []
    for set_params, all_scores in results_by_params.iteritems():
        grid_scores.append(
            _CVScoreTuple(dict(set_params), np.mean(all_scores),
                          np.array(all_scores)))

    # Store the computed scores
    self.grid_scores_ = grid_scores
    print(len(grid_scores))

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def _fit(self, Z, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    cv = self.cv
    cv = _check_cv(cv, Z)

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch, backend="threading"
    )(
        delayed(_fit_and_score)(clone(base_estimator), Z, self.scorer_,
                                train, test, self.verbose, parameters,
                                self.fit_params, return_parameters=True,
                                error_score=self.error_score)
        for parameters in parameter_iterable
        for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        best_estimator.fit(Z, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
''' prediction '''
clf.fit(Xtr, ytr)
yhat = clf.predict_proba(Xts)
# print yhat.shape

auc = compute_roc_auc_score_label_safe(yts, yhat[:, 1])
auc_cv[i] = auc
auc_cv_rk[i, r] = auc
# print 'iter: ', i, ' auc= ', auc

# print param, auc_cv.mean(), auc_cv.ravel()
# print res_selected_cv
# print [s for i, s in enumerate(aFeatNames) if res_selected_cv[i] > 0]
# print 'selected features: ', sum(res_selected_cv > 0)

grid_scores.append(
    _CVScoreTuple(param, auc_cv.mean(), np.array(auc_cv)))
# print param, auc_cv.mean(), auc_cv.std()

# print grid_scores
best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
              reverse=True)[0]
best_params_ = best.parameters
best_score_ = best.mean_validation_score

# grid_search = GridSearchCV(pipe, parameters, verbose=1, n_jobs=2, cv=k_feat)
# grid_search = GridSearchCV(clf, parameters, verbose=1, cv=cv, scoring='roc_auc')
# grid_search.fit(Xt, yt)
# print('Best features:', grid_search.best_estimator_.steps[0][1].k_feature_idx_)
print("Best score: %0.3f" % best_score_)
def _fit(self, depthmaps, offset_points_projected, direction_vectors,
         true_joints, parameter_iterable):
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(depthmaps)
    if _num_samples(offset_points_projected) != n_samples:
        raise ValueError('offset_points_projected has a different number '
                         'of samples ({0}) than data (depthmaps: {1} samples)'
                         .format(_num_samples(offset_points_projected),
                                 n_samples))
    if _num_samples(direction_vectors) != n_samples:
        raise ValueError('direction_vectors has a different number '
                         'of samples ({0}) than data (depthmaps: {1} samples)'
                         .format(_num_samples(direction_vectors), n_samples))
    cv = _check_cv(cv, n_samples)

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )(
        delayed(_fit_and_score)(clone(base_estimator), depthmaps,
                                offset_points_projected, direction_vectors,
                                true_joints, self.scorer_, train, test,
                                self.verbose, parameters, self.fit_params,
                                return_parameters=True,
                                error_score=self.error_score)
        for parameters in parameter_iterable
        for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        best_estimator.fit(depthmaps, offset_points_projected,
                           direction_vectors, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def _fit(self, X, y, parameter_iterable, en_celery=False):
    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    if en_celery:
        out = []
        timestamp = datetime.now().strftime("%Y%m%d%H%M%s")
        key = "sample_%s_%s" % (timestamp,
                                int(round(random.random(), 8) * 1e8))
        red.set(key, pickle.dumps({'X': X, 'y': y}))
        grp = group(cjobs.fas_mp.s(clone(base_estimator), key, self.scorer_,
                                   train, test, self.verbose, parameters,
                                   self.fit_params, return_parameters=True,
                                   error_score=self.error_score)
                    for parameters in parameter_iterable
                    for train, test in cv)()
        out = grp.get()
        red.delete(key)
    else:
        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                    train, test, self.verbose, parameters,
                                    self.fit_params, return_parameters=True,
                                    error_score=self.error_score)
            for parameters in parameter_iterable
            for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def _extendedFit(self, X, y, parameter_iterable):
    estimator = self.estimator
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch)(
        delayed(_extended_fit_and_score)(clone(base_estimator), X, y,
                                         self.scorer_, train, test,
                                         self.verbose, parameters,
                                         self.fit_params,
                                         return_parameters=True,
                                         error_score=self.error_score)
        for parameters in parameter_iterable
        for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    grid_extras = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        all_extras = []
        for this_score, this_n_test_samples, _, parameters, extra in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            all_extras.append(extra)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))
        grid_extras.append(all_extras)
    # Store the computed scores
    self.grid_scores_ = grid_scores
    self.extras_ = grid_extras

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        print "Refitting best estimator"
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    param_grid = [(parameters, train, test)
                  for parameters in parameter_iterable
                  for (train, test) in cv]
    # Because the original python code expects a certain order for the
    # elements, we need to respect it.
    indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
    par_param_grid = self.sc.parallelize(indexed_param_grid,
                                         len(indexed_param_grid))
    X_bc = self.sc.broadcast(X)
    y_bc = self.sc.broadcast(y)

    scorer = self.scorer_
    verbose = self.verbose
    fit_params = self.fit_params
    error_score = self.error_score
    fas = _fit_and_score

    def fun(tup):
        (index, (parameters, train, test)) = tup
        local_estimator = clone(base_estimator)
        local_X = X_bc.value
        local_y = y_bc.value
        res = fas(local_estimator, local_X, local_y, scorer, train, test,
                  verbose, parameters, fit_params,
                  return_parameters=True, error_score=error_score)
        return (index, res)

    indexed_out0 = dict(par_param_grid.map(fun).collect())
    out = [indexed_out0[idx] for idx in range(len(param_grid))]

    X_bc.unpersist()
    y_bc.unpersist()

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    foldsForEstimator = {}
    cv = self.cv

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    # Splits the data based on provided cross-validation splitting strategy.
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    # Change from original scikit code: adding a new argument,
    # foldsForEstimator, to the _fit_and_score function to track metadata
    # for each estimator, for each fold.
    # _fit_and_score fits the estimator and computes the score for a given
    # data-split, for given parameters.
    out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                   pre_dispatch=pre_dispatch)(
        delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                train, test, self.verbose, parameters,
                                self.fit_params, foldsForEstimator,
                                return_parameters=True,
                                error_score=self.error_score)
        for parameters in parameter_iterable
        for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    # Computes the scores for each of the folds, for all the possible
    # parameters, and stores them in grid_scores.
    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in out[
                grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    else:
        # If refit is false, best_estimator_ is unavailable and further
        # predictions can't be made on this instance.
        raise Warning(
            "Note: Refit has been set to false, which makes it impossible to "
            "make predictions using this GridSearchCV instance after fitting. "
            "Change refit to true to enable this")

    # Change from original scikit code:
    # Populate new field with necessary attributes for storing
    # cross-validation event
    self.grid_cv_event = [
        X, foldsForEstimator, 0, type_of_target(y), self.best_estimator_,
        self.best_estimator_, n_folds
    ]

    return self
    param=param,
    key=key,
    y_hat_valid_precomputed=d_yhat_valid_to_save_computation,
    auc_valid_precomputed=d_auc_valid_to_save_computation)

clf_param_key = create_unique_key_param_patient(key, param)
d_auc_valid_to_save_computation[clf_param_key] = out[0]
d_yhat_valid_to_save_computation[clf_param_key] = out[1]

auc_single_cv[:, key - 1] = out[0]

''' 2) add together prediction from each patient and compute AUC across all patients '''
auc_all_p = compute_auc_cv_for_all_p(
    d_data_train, d_yhat_valid_to_save_computation, Kinner, param,
    prob_calib_alg)

gs_1.append(
    _CVScoreTuple(param, auc_single_cv[:, 0].mean(),
                  np.array(auc_single_cv[:, 0])))
gs_2.append(
    _CVScoreTuple(param, auc_single_cv[:, 1].mean(),
                  np.array(auc_single_cv[:, 1])))
gs_3.append(
    _CVScoreTuple(param, auc_single_cv[:, 2].mean(),
                  np.array(auc_single_cv[:, 2])))
grid_scores_all.append(
    _CVScoreTuple(param, auc_all_p.mean(), np.array(auc_all_p)))

best = sorted(grid_scores_all, key=lambda x: x.mean_validation_score,
              reverse=True)[0]
best_params_ = best.parameters
best_score_ = best.mean_validation_score
def fit(self, X, y=None, x_is_index=False, X_name='X', y_name='y'):
    parameter_iterable = ParameterGrid(self.param_grid)

    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    # out = Parallel(
    #     n_jobs=self.n_jobs, verbose=self.verbose,
    #     pre_dispatch=pre_dispatch
    # )(
    #     delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    #                             train, test, self.verbose, parameters,
    #                             self.fit_params, return_parameters=True,
    #                             error_score=self.error_score)
    #     for parameters in parameter_iterable
    #     for train, test in cv)

    train_test_parameters = ((train, test, parameters)
                             for parameters in parameter_iterable
                             for train, test in cv)

    length = len(parameter_iterable) * len(cv)

    if x_is_index:
        X_to_pass = X
        y_to_pass = None
    else:
        X_to_pass = None
        y_to_pass = None

    self.view.block = False

    # print('sequences')
    # sequences = [
    #     train_test_parameters,
    #     [clone(base_estimator)] * length,
    #     [X_to_pass] * length,
    #     [y_to_pass] * length,
    #     [self.verbose] * length,
    #     [self.fit_params] * length,
    #     [True] * length,
    #     [self.scorer_] * length,
    #     [x_is_index] * length,
    # ]

    f = partial(my_fit_and_score, estimator=clone(base_estimator),
                X=X_to_pass,
                y=y_to_pass,
                verbose=self.verbose,
                fit_params=self.fit_params,
                return_parameters=True,
                scorer=None,
                x_is_index=x_is_index,
                names=(X_name, y_name))

    # print('before map')

    # import cProfile
    #
    # pr = cProfile.Profile()
    # pr.enable()
    chunksize = 10
    out = self.view.map(f,
                        itertools.islice(train_test_parameters, 0, length),
                        ordered=False,
                        block=False,
                        chunksize=chunksize)  # length / len(self.view))
    # pr.disable()
    # pr.print_stats('cumulative')
    print('map called')

    if self.callback is not None:
        old_progress = out.progress
        while not out.ready():
            self.callback(out.progress * chunksize, length, out.elapsed)
            if old_progress == out.progress and out.progress > 0:
                for id, info in self.view.queue_status(verbose=True).iteritems():
                    # print(id, info)
                    if isinstance(info, dict) and 'queue' in info \
                            and len(info['queue']) > 0:
                        print(id, info['queue'])
                    pass
            old_progress = out.progress
            sleep(10)
    print('map ready')
    out = out.get()

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
Xtr = pipe.fit_transform(Xtr, ytr)
Xts = pipe.transform(Xts)
# print 'selected features: ', Xtr.shape

clf.set_params(**param)
clf.fit(Xtr, ytr, eval_metric='auc')

''' prediction '''
yhat = clf.predict_proba(Xts)
auc = metrics.roc_auc_score(yts, yhat[:, 1])
auc_cv[jj] = auc
# auc_cv[jj] = auc if auc > 0.5 else 1 - auc

# print param, auc_cv.mean(), auc_cv.ravel()
grid_scores.append(_CVScoreTuple(param, auc_cv.mean(), np.array(auc_cv)))

''' Inner loop results '''
print("Inner loop: best parameters set:")
best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
              reverse=True)[0]
best_params_ = best.parameters
best_score_ = best.mean_validation_score
print("Best score: %0.3f" % best_score_)
print("Best parameters set:")
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_params_[param_name]))

# print 'Grid scores'
for score in grid_scores:
    # print score
# pdf_tr = gaussian_kde(d_yhat_for_dkl_computation[ip].ravel())
# pdf_ts = gaussian_kde(yts_unlabelled[:, 1])
# x = np.linspace(0, 1, 100)
#
# en = entropy(pdf_tr(x), pdf_ts(x))
# kl[ip] = en

# print auc_single_cv
# print auc_all_cv
if verbose:
    print kl
    print auc_single_cv.mean(axis=0)
    print auc_all_cv.mean()

# print param, auc_cv.mean(), auc_cv.ravel()
gs_1.append(_CVScoreTuple(param, auc_single_cv[:, 0].mean(),
                          np.array(auc_single_cv[:, 0])))
gs_2.append(_CVScoreTuple(param, auc_single_cv[:, 1].mean(),
                          np.array(auc_single_cv[:, 1])))
gs_3.append(_CVScoreTuple(param, auc_single_cv[:, 2].mean(),
                          np.array(auc_single_cv[:, 2])))
grid_scores_all.append(_CVScoreTuple(param, auc_all_cv.mean(),
                                     np.array(auc_all_cv)))
# dkl.append(kl)

# print grid_scores
best = sorted(grid_scores_all, key=lambda x: x.mean_validation_score,
              reverse=True)[0]
best_params_ = best.parameters
best_score_ = best.mean_validation_score

# grid_search = GridSearchCV(pipe, parameters, verbose=1, n_jobs=2, cv=k_feat)
# grid_search = GridSearchCV(clf, parameters, verbose=1, cv=cv, scoring='roc_auc')
# grid_search.fit(Xt, yt)
# print('Best features:', grid_search.best_estimator_.steps[0][1].k_feature_idx_)
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    X, y = indexable(X, y)
    cv = check_cv(cv, X, y, classifier=is_classifier(self.estimator))

    base_estimator = clone(self.estimator)

    out = [_fit_and_score(clone(base_estimator), X, y, self.scorer_,
                          train, test, self.verbose, parameters,
                          self.fit_params, return_parameters=True,
                          error_score=self.error_score)
           for parameters in parameter_iterable
           for train, test in cv]

    self._dask_value = value(out)
    out, = compute(value(out))

    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')
    self.scorer_ = _deprecate_loss_and_score_funcs(self.loss_func,
                                                   self.score_func,
                                                   self.scoring)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                   pre_dispatch=pre_dispatch)(
        delayed(fit_grid_point)(
            X, y, base_estimator, parameters, train, test,
            self.scorer_, self.verbose,
            **{'sample_weight': balance_weights(y[train])})
        for parameters in parameter_iterable
        for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, parameters, this_n_test_samples in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, sample_weight=balance_weights(y),
                               **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def grid_search_early_stopping(estimator, param_grid, verbose, scoring, cv,
                               X, y, early_stopping_rounds, eval_set_size,
                               n_jobs=1, iid=True, refit=True,
                               pre_dispatch='2*n_jobs', error_score='raise'):
    '''
    This is from scikit-learn package.
    '''
    parameter_iterable = ParameterGrid(param_grid)
    scorer_ = check_scoring(estimator, scoring=scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(estimator)

    pre_dispatch = pre_dispatch

    out = Parallel(
        n_jobs=n_jobs, verbose=2 if verbose > 0 else 0,
        pre_dispatch=pre_dispatch)(
        delayed(_fit_and_score)(
            clone(base_estimator), X, y, scorer_, train, test,
            2 if verbose > 0 else 0, parameters,
            {
                "early_stopping_rounds": early_stopping_rounds,
                "eval_metric": get_xgboost_eval_metric(scoring),
                "eval_set": [_safe_split(estimator, X, y, test, train)],
                "verbose": True if verbose > 1 else False
            },
            return_parameters=True, error_score=error_score)
        for parameters in parameter_iterable
        for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    best_score_ = best.mean_validation_score

    if refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        if y is not None:
            best_estimator, _, _ = fit_estimator_early_stopping(
                best_estimator, X, y, scoring, early_stopping_rounds,
                eval_set_size, verbose)
        else:
            raise ValueError('y is required.')
    return best_estimator, best.parameters, grid_scores
def _fit(self, X, y, sample_weight, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y, sample_weight = check_arrays(X, y, sample_weight,
                                       allow_lists=True,
                                       sparse_format='csr')

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    # first fit at each grid point using the maximum n_estimators
    param_grid = self.param_grid.copy()
    param_grid['n_estimators'] = [self.max_n_estimators]
    grid = ParameterGrid(param_grid)

    pre_dispatch = self.pre_dispatch
    clfs = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                    pre_dispatch=pre_dispatch)(
        delayed(fit_grid_point)(
            base_estimator, clf_params, X, y, sample_weight, train, test,
            self.verbose, **self.fit_params)
        for clf_params in grid
        for train, test in cv)

    # now use the already fitted ensembles but truncate to N estimators for
    # N from 1 to n_estimators_max - 1 (inclusive)
    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch)(
        delayed(score_each_boost)(clf, clf_params, self.min_n_estimators,
                                  X, y, sample_weight, self.score_func,
                                  train, test, self.verbose)
        for clf, clf_params, train, test in clfs)

    out = reduce(operator.add, [zip(*stage) for stage in out])

    # out is now a list of triplet: score, estimator_params, n_test_samples
    n_estimators_points = self.max_n_estimators - self.min_n_estimators + 1
    n_fits = len(out)
    n_folds = len(cv)

    grid_scores = list()
    for block in range(0, n_fits, n_folds * n_estimators_points):
        for grid_start in range(block, block + n_estimators_points):
            n_test_samples = 0
            score = 0
            all_scores = list()
            for this_score, parameters, this_n_test_samples in \
                    out[grid_start:
                        grid_start + n_folds * n_estimators_points:
                        n_estimators_points]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                score += this_score
                n_test_samples += this_n_test_samples
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        fit_params = self.fit_params
        if sample_weight is not None:
            fit_params = fit_params.copy()
            fit_params['sample_weight'] = sample_weight
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **fit_params)
        else:
            best_estimator.fit(X, **fit_params)
        self.best_estimator_ = best_estimator
    return self
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')
    self.scorer_ = _deprecate_loss_and_score_funcs(
        self.loss_func, self.score_func, self.scoring)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch)(
        delayed(fit_grid_point_extended)(
            X, y, base_estimator, parameters, train, test,
            self.scorer_, self.verbose, **self.fit_params)
        for parameters in parameter_iterable
        for train, test in cv)
    # out = []
    # for parameters in parameter_iterable:
    #     fold = 1
    #     for train, test in cv:
    #         print "Processing fold", fold, self.fit_params
    #         out.append(fit_grid_point_extended(X, y, base_estimator,
    #                                            parameters, train, test,
    #                                            self.scorer_, self.verbose,
    #                                            **self.fit_params))
    #         fold += 1

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_extras = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        all_extras = list()
        for this_score, parameters, this_n_test_samples, extra in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            all_extras.append(extra)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
        grid_extras.append(all_extras)
    # Store the computed scores
    self.grid_scores_ = grid_scores
    self.extras_ = grid_extras

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
def _fit(self, X, y, sample_weight, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y, sample_weight = check_arrays(X, y, sample_weight,
                                       allow_lists=True,
                                       sparse_format='csr')

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    # first fit at each grid point using the maximum n_estimators
    param_grid = self.param_grid.copy()
    param_grid['n_estimators'] = [self.max_n_estimators]
    grid = ParameterGrid(param_grid)

    pre_dispatch = self.pre_dispatch
    clfs = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )(
        delayed(fit_grid_point)(base_estimator, clf_params, X, y,
                                sample_weight, train, test, self.verbose,
                                **self.fit_params)
        for clf_params in grid
        for train, test in cv)

    # now use the already fitted ensembles but truncate to N estimators for
    # N from 1 to n_estimators_max - 1 (inclusive)
    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )(
        delayed(score_each_boost)(clf, clf_params, self.min_n_estimators,
                                  X, y, sample_weight, self.score_func,
                                  train, test, self.verbose)
        for clf, clf_params, train, test in clfs)

    out = reduce(operator.add, [zip(*stage) for stage in out])

    # out is now a list of triplet: score, estimator_params, n_test_samples
    n_estimators_points = self.max_n_estimators - self.min_n_estimators + 1
    n_fits = len(out)
    n_folds = len(cv)

    grid_scores = list()
    for block in range(0, n_fits, n_folds * n_estimators_points):
        for grid_start in range(block, block + n_estimators_points):
            n_test_samples = 0
            score = 0
            all_scores = list()
            for this_score, parameters, this_n_test_samples in \
                    out[grid_start:
                        grid_start + n_folds * n_estimators_points:
                        n_estimators_points]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                score += this_score
                n_test_samples += this_n_test_samples
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        fit_params = self.fit_params
        if sample_weight is not None:
            fit_params = fit_params.copy()
            fit_params['sample_weight'] = sample_weight
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **fit_params)
        else:
            best_estimator.fit(X, **fit_params)
        self.best_estimator_ = best_estimator
    return self