def cross_val_score_fn(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs'): """ Evaluate a score by cross-validation. This overrides the cross_val_score method typically found in cross_validation.py. Changes are clearly marked in comments, but the main change is augmenting the function to store Fit and Metric Events for each fold. """ X, y = indexable(X, y) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) scorer = check_scoring(estimator, scoring=scoring) # Default scoring scheme is 'accuracy' unless provided by user. if scoring is None: scoring = 'accuracy' # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) # Change from original scikit code: adding a new argument, scoring, to the # _fit_and_score function to track scoring function and create # MetricEvents. scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test, verbose, None, fit_params, scoring) for train, test in cv) return np.array(scores)[:, 0]
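# Illustration (not part of the original snippet): a minimal, hedged sketch of
# the scikit-learn helpers the override above relies on (indexable, check_cv,
# check_scoring) on toy data. Import paths follow recent sklearn releases;
# older versions used check_cv(cv, X, y, classifier=...) as in the snippet above.
import numpy as np
from sklearn.utils import indexable
from sklearn.model_selection import check_cv
from sklearn.metrics import check_scoring
from sklearn.linear_model import LogisticRegression

X = np.arange(20).reshape(10, 2)
y = np.array([0, 1] * 5)

X, y = indexable(X, y)                    # validates consistent lengths
cv = check_cv(5, y, classifier=True)      # -> StratifiedKFold(n_splits=5)
scorer = check_scoring(LogisticRegression(), scoring="accuracy")

for train, test in cv.split(X, y):
    print(len(train), len(test))          # 8 2 on each of the 5 folds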
def fit(self, X, y=None, groups=None, **fit_params): # pylint: disable=unbalanced-tuple-unpacking X, y, groups = indexable(X, y, groups) # Evaluate candidates. scorers = { "score": check_scoring(self.estimator, scoring=self.scoring) } candidate_params = list( ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state)) self.cv_results_ = self.evaluate_candidates(X, y, groups, candidate_params, scorers, fit_params) # Get the best parameter. self.best_index_ = self.cv_results_["rank_test_score"].argmin() self.best_score_ = self.cv_results_["mean_test_score"][ self.best_index_] self.best_params_ = self.cv_results_["params"][self.best_index_] # Refit. if self.refit: best_estimator = clone( self.estimator).set_params(**self.best_params_) if y is not None: best_estimator.fit(X, y, **fit_params) else: best_estimator.fit(X, **fit_params) self.best_estimator_ = best_estimator return self
def fit_and_save(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', return_train_score=True, parameters=dict(), uuid='', url='http://127.0.0.1:8000'): import json, requests, numpy from sklearn.model_selection._validation import cross_validate X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring) _base_scores = [0. for _ in range(cv.get_n_splits(X, y, groups))] cv_score = {} cv_score.update( {'train_%s' % s: numpy.array(_base_scores) for s in scorers}) cv_score.update( {'test_%s' % s: numpy.array(_base_scores) for s in scorers}) cv_score.update({'fit_time': _base_scores, 'score_time': _base_scores}) try: cv_score = cross_validate(estimator, X, y, groups, scorers, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score) error = None except Exception as e: error = '{}: {}'.format(type(e).__name__, str(e)) try: for k, v in cv_score.items(): if type(v) == type(numpy.array([])): cv_score[k] = v.tolist() response = requests.post('{url}/grids/{uuid}/results'.format( url=url, uuid=uuid), data={ 'gridsearch': uuid, 'params': json.dumps(parameters), 'errors': error, 'cv_data': json.dumps(cv_score) }) except requests.exceptions.ConnectionError as e: response = None if response is None: return return response
def fit(self, X, y=None, *, groups=None, **fit_params): """Run fit with all sets of parameters. Parameters ---------- X : array-like of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like of shape (n_samples, n_output) \ or (n_samples,), default=None Target relative to X for classification or regression; None for unsupervised learning. groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). **fit_params : dict of str -> object Parameters passed to the ``fit`` method of the estimator """ estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) scorers, self.multimetric_ = _check_multimetric_scoring( self.estimator, scoring=self.scoring) X, y, groups = indexable(X, y, groups) self._run_search(X, y, cv) return self
def split(self, y, exogenous=None): """Generate indices to split data into training and test sets Parameters ---------- y : array-like or iterable, shape=(n_samples,) The time-series array. exogenous : array-like, shape=[n_obs, n_vars], optional (default=None) An optional 2-d array of exogenous variables. Yields ------ train : np.ndarray The training set indices for the split test : np.ndarray The test set indices for the split """ y, exog = indexable(y, exogenous) indices = np.arange(y.shape[0]) for train_index, test_index in self._iter_train_test_masks(y, exog): train_index = indices[train_index] test_index = indices[test_index] yield train_index, test_index
def fit(self, X, y, sample_weight=None, groups=None, missing=None, cat_cols=None, n_trials=10, timeout_per_estimator=None): # TODO check that y is regression and not classification # TODO: consider log-transform y? X, y, groups = indexable(X, y, groups) y = np.array(y) y_mean = np.mean(y) cv = check_cv(self.cv, y, classifier=False) if cv.random_state is None: cv.random_state = self.random_state # if self.sampler.seed is None: # self.sampler.seed = self.random_state scorer, scorer_type, greater_is_better = get_scorer_type(self.scoring) valid_estimators = get_estimators(self.frameworks, self.model_types, objective_type="regression") #valid_estimators = filter_estimators(X, valid_estimators, y_mean, "regression") self.run_study(X, y, valid_estimators, cv, scorer, scorer_type, greater_is_better, y_stats=y_mean, objective_type="regression", sample_weight=sample_weight, groups=groups, missing=missing, cat_cols=cat_cols, timeout_per_estimator=timeout_per_estimator, n_trials=n_trials) self.save_results() if self.refit_: self.best_pipeline_.fit(X, y)
def _gen_train_val(self, X, y, cv_split_method):
    X, y, groups = indexable(X, y, None)
    Xs_tr, ys_tr, Xs_cv, ys_cv = [], [], [], []
    if isinstance(cv_split_method, BaseCrossValidator):
        for tr, cv in cv_split_method.split(X, y, groups):
            X_tr, y_tr = _safe_split(self, X, y, tr)
            X_cv, y_cv = _safe_split(self, X, y, cv, tr)
            Xs_tr.append(X_tr)
            Xs_cv.append(X_cv)
            ys_tr.append(y_tr)
            ys_cv.append(y_cv)
    elif cv_split_method.__name__ == 'train_test_split':
        # train_test_split returns (X_train, X_test, y_train, y_test).
        X_tr, X_cv, y_tr, y_cv = train_test_split(
            X, y,
            random_state=self._random_state,
            test_size=self.validation_fraction)
        Xs_tr.append(X_tr)
        Xs_cv.append(X_cv)
        ys_tr.append(y_tr)
        ys_cv.append(y_cv)
    else:
        raise ValueError("Split method should be a "
                         "sklearn.model_selection splitter class...")
    return Xs_tr, ys_tr, Xs_cv, ys_cv
def _three_way_split(splitter: KFold, X, y: Optional = None, groups: Optional = None) -> Generator: """A modified version of BaseCrossValidator.split(). Yields (K-2/1/1) train/val/test splits. """ X, y, groups = indexable(X, y, groups) indices = np.arange(_num_samples(X)) test_masks_it = splitter._iter_test_masks(X, y, groups) first_mask = last_mask = next(test_masks_it) for test_mask in test_masks_it: train_index = indices[np.logical_not( np.logical_or(test_mask, last_mask))] val_index = indices[last_mask] test_index = indices[test_mask] yield train_index, val_index, test_index last_mask = test_mask # last fold test_mask = first_mask train_index = indices[np.logical_not( np.logical_or(test_mask, last_mask))] val_index = indices[last_mask] test_index = indices[test_mask] yield train_index, val_index, test_index
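# Illustration (not part of the original snippet): hypothetical usage sketch
# for the generator above, assuming _three_way_split is in scope. Each KFold
# fold serves once as the test set, with the previous fold as validation and
# the remaining K-2 folds as training data.
import numpy as np
from sklearn.model_selection import KFold

X = np.arange(30).reshape(15, 2)
y = np.arange(15)

for train_idx, val_idx, test_idx in _three_way_split(KFold(n_splits=5), X, y):
    assert set(val_idx).isdisjoint(test_idx)
    print(len(train_idx), len(val_idx), len(test_idx))   # 9 3 3 per split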
def split(self, y, X=None, **kwargs): # TODO: remove kwargs """Generate indices to split data into training and test sets Parameters ---------- y : array-like or iterable, shape=(n_samples,) The time-series array. X : array-like, shape=[n_obs, n_vars], optional (default=None) An optional 2-d array of exogenous variables. Yields ------ train : np.ndarray The training set indices for the split test : np.ndarray The test set indices for the split """ # Temporary shim until we remove `exogenous` support completely X, _ = pm_compat.get_X(X, **kwargs) y, X = indexable(y, X) indices = np.arange(y.shape[0]) for train_index, test_index in self._iter_train_test_masks(y, X): train_index = indices[train_index] test_index = indices[test_index] yield train_index, test_index
def split(self, X, y=None, groups=None):
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    indices = np.arange(n_samples)
    n_splits = self.n_splits
    fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
    fold_sizes[:n_samples % n_splits] += 1
    current = 0
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        test_index = indices[start:stop]
        # Pad the test block by n_reduce samples on each side and exclude the
        # padded block from the training indices.
        start_pad = max(start - self.n_reduce, 0)
        stop_pad = min(stop + self.n_reduce, n_samples)
        block_index = indices[start_pad:stop_pad]
        block_mask = np.zeros(n_samples, dtype=bool)
        block_mask[block_index] = True
        train_index = indices[np.logical_not(block_mask)]
        yield train_index, test_index
        current = stop
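# Illustration (not part of the original snippet): standalone sketch of the
# fold sizing used above. Every fold gets n_samples // n_splits points and the
# remainder is spread over the first folds, exactly like sklearn's KFold.
import numpy as np

n_samples, n_splits = 10, 3
fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
fold_sizes[:n_samples % n_splits] += 1
print(fold_sizes)   # [4 3 3]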
def fit(self, X, y): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv n_samples = _num_samples(X) X, y = indexable(X, y) param_grid = ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(param_grid, Sized): n_candidates = len(param_grid) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) indexed_param_grid = list(zip(range(len(param_grid)), param_grid)) par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid)) X_bc = self.sc.broadcast(X) y_bc = self.sc.broadcast(y) scorer = self.scorer_ indexed_output = dict( par_param_grid.map( lambda i: local_fit(i[0], i[1], base_estimator, X_bc.value, y_bc.value, scorer, cv)).collect()) out = [indexed_output[idx] for idx in range(len(param_grid))] X_bc.unpersist() y_bc.unpersist() best = sorted(out, key=lambda x: x[0], reverse=True)[0] self.best_params_ = best[1] self.best_score_ = best[0] if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params(**best[1]) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def fit(self, X, y, groups=None): """Actual fitting, performing the search over parameters.""" results = dict() best_index = None best_parameters = None for bracket_idx in range(self.num_brackets - 1, -1, -1): successive_halving_steps = bracket_idx + 1 # TODO: num_arms should be different estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) base_estimator = clone(self.estimator) arms_pulled = 0 if 'mean_test_score' in results: arms_pulled = len(results['mean_test_score']) res = self._successive_halving(X, y, groups, cv, self.eta, successive_halving_steps - 1, self.num_brackets - 1) bracket_results, bracket_best_index, bracket_best_parameters = res for key, values in bracket_results.items(): if key not in results: results[key] = values else: results[key] = np.append(results[key], values) if best_index is None: best_index = bracket_best_index + arms_pulled best_parameters = bracket_best_parameters elif bracket_results['mean_test_score'][ bracket_best_index] > results['mean_test_score'][ best_index]: best_index = bracket_best_index + arms_pulled best_parameters = bracket_best_parameters self.cv_results_ = results self.best_index_ = best_index self.n_splits_ = n_splits if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best_parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. :param X: array-like of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. :param y: array-like of shape (n_samples,) Always ignored, exists for compatibility. :param groups: array-like of shape (n_samples,) Always ignored, exists for compatibility. :returns: train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """ if groups is None: raise ValueError("The 'groups' parameter should not be None.") X, y, groups = indexable(X, y, groups) groups = check_array(groups, ensure_2d=False, dtype=None) unique_groups, groups = np.unique(groups, return_inverse=True) n_samples_per_group = np.bincount(groups) n_groups = len(unique_groups) n_samples = _num_samples(X) n_splits = self.n_splits n_folds = n_splits + 1 if self.n_splits > n_groups: raise ValueError("Cannot have number of splits n_splits=%d greater" " than the number of groups: %d." % (self.n_splits, n_groups)) indices = np.arange(n_samples) test_size = (n_groups // n_folds) test_starts = range(test_size + n_groups % n_folds, n_groups, test_size) for test_start in test_starts: # here we already have groups after inverse operation # and don't need to use unique_group if self.max_train_size: # find number of group for start not to overflow train size sizes = n_samples_per_group[:test_start][::-1].cumsum() appropriate_indices = np.where(sizes <= self.max_train_size)[0] if appropriate_indices.size == 0: train_start = max(test_start - 1, 0) else: train_start = test_start - appropriate_indices.max() - 1 yield (indices[(groups < test_start) & (groups >= train_start)], indices[(groups >= test_start) & (groups < test_start + test_size)]) else: yield (indices[groups < test_start], indices[(groups >= test_start) & (groups < test_start + test_size)])
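# Illustration (not part of the original snippet): standalone sketch of the
# group-encoding step used above. np.unique with return_inverse assigns each
# sample an integer group id (ordered by group label), and np.bincount counts
# samples per group.
import numpy as np

groups = np.array(["2020-01", "2020-01", "2020-02", "2020-03", "2020-03"])
unique_groups, group_ids = np.unique(groups, return_inverse=True)
print(unique_groups)            # ['2020-01' '2020-02' '2020-03']
print(group_ids)                # [0 0 1 2 2]
print(np.bincount(group_ids))   # [2 1 2] samples per group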
def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. Parameters ---------- X : array-like, shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples,) The target variable for supervised learning problems. groups : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. Returns ------- train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """ if groups is not None: # find all indices that are at the beginning of a group groups_unique = np.unique(groups) possible_test_start = [ np.where(i == groups)[0][0] for i in np.nditer(groups_unique) ] possible_test_start = np.asarray(possible_test_start) X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) n_splits = self.n_splits n_folds = n_splits + 1 if n_folds > n_samples: raise ValueError(("Cannot have number of folds ={0} greater" " than the number of samples: {1}.").format( n_folds, n_samples)) indices = np.arange(n_samples) test_size = (n_samples // n_folds) test_starts = range(test_size + n_samples % n_folds, n_samples, test_size) if groups is not None: # find all possible starts that are closest to predefined test_starts test_starts = [ possible_test_start[np.abs(possible_test_start - i).argmin()] for i in test_starts ] for test_start in test_starts: yield (indices[:test_start], indices[test_start:test_start + test_size])
def split(self, X, y=None, groups=None):
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    train_size = round((1 - self.test_size) * n_samples)
    # Hold out the final test_size fraction; drop the last n_reduce training
    # samples to leave a gap between the training and test windows.
    train_index = np.arange(train_size - self.n_reduce)
    test_index = np.arange(train_size, n_samples)
    yield train_index, test_index
def fit(self, X, y): """Actual fitting, performing the search over parameters.""" parameter_iterable = ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state) estimator = self.estimator cv = self.cv n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )( delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring, parameters, cv=cv) for parameters in parameter_iterable) best = sorted(out, reverse=True)[0] self.best_params_ = best[1] self.best_score_ = best[0] if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best[1]) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def cross_val_score(estimator, X, y=None, fold_specific_X_extractor=None, groups=None, scorings=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs'): """ :param estimator: :param X: :param y: :param fold_specific_X_extractor: :param groups: :param scorings: list of scorings (strings, callables, etc...) :param cv: :param n_jobs: :param verbose: :param fit_params: :param pre_dispatch: :return: an array of scores, shape: <folds x scores> """ X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) scorers = [ check_scoring(estimator, scoring=scoring) for scoring in scorings ] # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) scores = parallel( delayed(_fe_fit_and_score)( clone(estimator), X, y, scorers, train, test, verbose, None, fit_params, fold_specific_X_extractor=fold_specific_X_extractor) for train, test in cv.split(X, y, groups)) # here scores is python list of shape <folds x 1 x scores> scores = np.array(scores) # eliminate middle axis return scores.reshape((scores.shape[0], scores.shape[2]))
def __init__(self, X, generator, y=None, batch_size=32, shuffle=True,
             sample_weight=None, seed=None):
    X, y, sample_weight = indexable(X, y, sample_weight)
    self.X = X
    self.generator = generator
    self.y = y
    self.sample_weight = sample_weight
    super(FastaToArrayIterator, self).__init__(
        X.shape[0], batch_size, shuffle, seed)
def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. Parameters ---------- X : array-like, of length n_samples Training data, includes reaction's containers y : array-like, of length n_samples The target variable for supervised learning problems. groups : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """ X, y, groups = indexable(X, y, groups) cgr = CGRpreparer() cgrs = [cgr.condense(r) for r in X] structure_condition = defaultdict(set) for structure, condition in zip(cgrs, groups): structure_condition[structure].add(condition) train_data = defaultdict(list) test_data = [] for n, (structure, condition) in enumerate(zip(cgrs, groups)): train_data[condition].append(n) if len(structure_condition[structure]) > 1: test_data.append(n) for condition, indexes in train_data.items(): test_index = [index for index in indexes if index in test_data] if len(test_index) == 0: continue train_index = [] for c in train_data.keys(): if not c == condition: train_index.extend(train_data[c]) yield array(train_index), array(test_index)
def split(self, X, y=None, groups=None):
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    indices = np.arange(n_samples)
    argsort = np.argsort(groups)
    for i in range(self.n_splits):
        # Every n_splits-th sample (in group-sorted order) goes to the
        # training set; everything else forms the test set.
        train_index = indices[argsort[i::self.n_splits]]
        train_mask = np.zeros(n_samples, dtype=bool)
        train_mask[train_index] = True
        test_index = indices[np.logical_not(train_mask)]
        yield train_index, test_index
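# Illustration (not part of the original snippet): standalone sketch of the
# selection rule above. After sorting sample indices by group value, every
# n_splits-th index becomes the training set of one split, so each split's
# training data spans the full range of group values.
import numpy as np

groups = np.array([5, 1, 4, 2, 3, 0])
argsort = np.argsort(groups)        # [5 1 3 4 2 0]
n_splits = 2
for i in range(n_splits):
    print(argsort[i::n_splits])     # [5 3 2], then [1 4 0]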
def split(self, X, y, erry=10e-4, groups=None, match_window=np.inf,
          num_pairs=None, closest_match=False, random_state=None):
    X, y, groups = indexable(X, y, groups)
    num_samples = _num_samples(X)
    if num_samples < 2:
        raise ValueError(
            'Number of samples must be greater than or equal to 2.')
    return self.generate_train_test(X, y, erry, groups, match_window,
                                    num_pairs, closest_match, random_state)
def fit(self, X, y, groups=None): """Actual fitting, performing the search over parameters.""" num_arms = self.eta**(self.num_steps - 1) parameter_iterable = ParameterSampler(self.param_distributions, num_arms, random_state=self.random_state) estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(n_splits, n_candidates, n_candidates * n_splits)) base_estimator = clone(self.estimator) results, best_index, best_parameters = self._successive_halving( X, y, groups, cv, self.eta, self.num_steps - 1, self.num_steps - 1) self.cv_results_ = results self.best_index_ = best_index self.n_splits_ = n_splits if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best_parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. Parameters ---------- X : array-like, shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples,) Always ignored, exists for compatibility. groups : array-like, with shape (n_samples,), optional Always ignored, exists for compatibility. Returns ------- train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """ X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) initial_train_index = max( 0, _get_size(n_samples, self.initial_train_index, neg_mode='subtract')) final_index = n_samples test_size = _get_size(n_samples - initial_train_index, self.test_size, neg_mode='subtract') indices = np.arange(n_samples) train = indices[initial_train_index:-test_size] test = indices[-test_size:final_index] yield train, test
def split(self, X, y=None, groups=None): X, y, groups = indexable(X, y, groups) indices = np.arange(_num_samples(X)) X_copy = X.copy() X_copy.insert(0, "idx", indices) for test_index in self._iter_test_masks(X_copy, y, "pid"): train_index = indices[np.logical_not(test_index)] test_index = indices[test_index] discard_train_index = [] # exclude days after test for specific participant for pid in set(X_copy.iloc[test_index].index.get_level_values("pid").tolist()): participant_in_train = X_copy.iloc[train_index][X_copy.iloc[train_index].index.get_level_values("pid") == pid] participant_in_test = X_copy.iloc[test_index][X_copy.iloc[test_index].index.get_level_values("pid") == pid] last_date_in_test = participant_in_test.index.max() discard_train_index.extend(participant_in_train[participant_in_train.index >= pd.Index([last_date_in_test]*len(participant_in_train))]["idx"].tolist()) train_index = train_index[~np.isin(train_index, discard_train_index)] yield train_index, test_index
def cross_val_score_(estimators, X, y=None, groups=None, scoring=None,
                     cv=None, n_jobs=1, verbose=0, fit_params=None):
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=True)
    cv_iter = list(cv.split(X, y, groups))
    parallel = Parallel(n_jobs=n_jobs, verbose=0)
    scores = parallel(
        delayed(_fit_and_score)(estimators[i], X, y,
                                check_scoring(estimators[i], scoring=scoring),
                                train, test, verbose, None, fit_params)
        for i, (train, test) in enumerate(cv_iter))
    return np.array(scores)[:, 0]
def to_indexable(*args, **kwargs): """Ensure that all args are an indexable type. Conversion runs lazily for dask objects, immediately otherwise. Parameters ---------- args : array_like or scalar allow_scalars : bool, optional Whether to allow scalars in args. Default is False. """ if kwargs.get('allow_scalars', False): indexable = _maybe_indexable else: indexable = _indexable for x in args: if x is None or isinstance(x, da.Array): yield x elif is_dask_collection(x): yield delayed(indexable, pure=True)(x) else: yield indexable(x)
def to_indexable(*args, **kwargs): """Ensure that all args are an indexable type. Conversion runs lazily for dask objects, immediately otherwise. Parameters ---------- args : array_like or scalar allow_scalars : bool, optional Whether to allow scalars in args. Default is False. """ if kwargs.get("allow_scalars", False): indexable = _maybe_indexable else: indexable = _indexable for x in args: if x is None or isinstance(x, da.Array): yield x elif is_dask_collection(x): yield delayed(indexable, pure=True)(x) else: yield indexable(x)
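# Illustration (not part of the original snippet): hedged usage sketch for
# to_indexable above. It assumes the helpers it calls (_indexable /
# _maybe_indexable, shown further below) are importable alongside it and that
# dask is installed. Dask arrays pass through as-is, other dask collections
# are wrapped lazily, and plain sequences are validated immediately.
import dask.array as da
from dask import delayed

X = da.ones((10, 2), chunks=5)               # dask array: yielded unchanged
lazy = delayed(lambda: list(range(10)))()    # other dask collection: wrapped lazily
plain = [1, 2, 3]                            # plain list: validated right away

X_out, lazy_out, plain_out = to_indexable(X, lazy, plain)
print(type(X_out), type(lazy_out), type(plain_out))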
def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. Parameters ---------- X : array-like, shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples,) Always ignored, exists for compatibility. groups : array-like, with shape (n_samples,) Always ignored, exists for compatibility. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """ X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) indices = np.arange(n_samples) h = self.h min_position = np.maximum(h, int(n_samples * (1 - self.p_to_use))) positions = np.flip(np.arange(min_position, n_samples - h)) for position in positions: yield (indices[:position], indices[position:position + h])
def split(self, X, y=None, groups=None): """ Generates indices to split training and testing data Args: X: y: Not used, exists for compatibility groups: Not used, exists for compatibility Returns: train (np.ndarray): indices for training set test (np.ndarray): indices for testing set """ X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) # len(X) n_splits = self.n_splits trainm = self.min_train_size trainM = self.max_train_size if self.max_train_size is not None else np.inf testm = self.min_test_size testM = self.max_test_size if self.max_test_size is not None else np.inf delay = self.delay if (n_samples - (trainm + delay + testm) < n_splits - 1): raise ValueError("Not enough samples") # The datum for each fold will be the index of the first test sample self.test_starts = np.linspace(trainm + delay, n_samples - testm, n_splits, dtype=int) indices = np.arange(n_samples) for test_start in self.test_starts: test_end = min(test_start + testM, n_samples) train_end = test_start - delay train_start = max(test_start - delay - trainM, 0) yield indices[train_start:train_end], indices[test_start:test_end]
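# Illustration (not part of the original snippet): standalone sketch of the
# fold-placement rule above. The first test index of each split ("the datum")
# is spread evenly between the earliest feasible position (min_train + delay)
# and the latest one (n_samples - min_test).
import numpy as np

n_samples, n_splits = 100, 5
min_train, min_test, delay = 30, 10, 0
test_starts = np.linspace(min_train + delay, n_samples - min_test,
                          n_splits, dtype=int)
print(test_starts)   # [30 45 60 75 90]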
def fit(self, X, y, sample_weight=None, groups=None, missing=None,
        cat_cols=None, n_trials=10, timeout_per_estimator=None):
    X, y, groups = indexable(X, y, groups)
    # Encode labels as an integer np.array.
    le = LabelEncoder()
    y = le.fit_transform(y)
    class_counts = np.bincount(y)
    cv = check_cv(self.cv, y, classifier=True)
    if cv.random_state is None:
        cv.random_state = self.random_state
    # if self.sampler.seed is None:  # TODO: check for CMA
    #     self.sampler.seed = self.random_state
    scorer, scorer_type, greater_is_better = get_scorer_type(self.scoring)
    data_tags = get_data_tags(X, y, "classification", class_counts)
    # Get estimators ("name", tags, class) by installed packages + version,
    # then filter estimators by data & constraints.
    valid_estimators = get_estimators(self.frameworks, self.model_types,
                                      objective_type="classification")
    # valid_estimators = filter_estimators(X, valid_estimators, class_counts,
    #                                      "classification")
    self.run_study(X, y, valid_estimators, cv, scorer, scorer_type,
                   greater_is_better, y_stats=class_counts,
                   objective_type="classification",
                   sample_weight=sample_weight, groups=groups,
                   missing=missing, cat_cols=cat_cols,
                   timeout_per_estimator=timeout_per_estimator,
                   n_trials=n_trials)
    self.save_results()
    if self.refit_:
        self.best_pipeline_.fit(X, y)
def fit(self, X, y=None, labels=None): #return self._fit( # X, y, labels, # parameter_iterable # parameter_iterable, \in Sized, it actually does len(parameter_iterable) in _fit #) # FIXME code duplication from BaseSearchCV._fit estimator = self.estimator cv = _split.check_cv(self.cv, y, classifier=is_classifier(estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y, labels = indexable(X, y, labels) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) n_splits = cv.get_n_splits(X, y, labels) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(n_splits, n_candidates, n_candidates * n_splits)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch # FIXME how to handle pre_dispatch # FIXME recursively getting new parameters to evaluate # parameter_iterable = ... # the magic # # # The evaluation (Parallel) stuff # out = Parallel( # n_jobs=self.n_jobs, verbose=self.verbose, # pre_dispatch=pre_dispatch # )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, # train, test, self.verbose, parameters, # self.fit_params, return_parameters=True, # error_score=self.error_score) # for parameters in parameter_iterable # for train, test in cv.split(X, y, labels)) # # n_fits on each (train, test) def cross_validation(raw_parameters): parameters = dict(zip( self.param_grid.keys(), raw_parameters )) # TODO more robust way of doing this print(parameters) return Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for train, test in cv.split(X, y, labels)) x = cartesian_product(*self.param_grid.values()) # FIXME implement as non-recursive def bo_(x_obs, y_obs, n_iter): if n_iter > 0: kernel = kernels.Matern() + kernels.WhiteKernel() gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=16) gp.fit(x_obs, 1-y_obs) a = a_EI(gp, x_obs=x_obs, y_obs=1-y_obs) argmax_f_x_ = x[np.argmax(a(x))] # heavy evaluation f_argmax_f_x_ = cross_validation(argmax_f_x_) y_ob = np.atleast_2d(mean_mean_validation_scores(f_argmax_f_x_)).T return f_argmax_f_x_ + bo_( x_obs=np.vstack((x_obs, argmax_f_x_)), y_obs=np.vstack((y_obs, y_ob)), n_iter=n_iter-1, ) else: return [] # FIXME (most informative) decision like Numerical Probabilistics stuff for integrations # sobol initilization? 
sampled_x_ind = np.random.choice( x.shape[0], size=self.n_initial_points, replace=False, ) print(sampled_x_ind) x_obs = x[sampled_x_ind] f_x_obs = list(map(cross_validation, x_obs)) y_obs = np.atleast_2d(list(map(mean_mean_validation_scores, f_x_obs))).T out = sum(f_x_obs, []) + bo_(x_obs, y_obs, n_iter=self.n_iter) n_fits = len(out) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_splits): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _ , parameters in \ out[grid_start:grid_start + n_splits]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_splits) scores.append((score, parameters)) grid_scores.append(_search._CVScoreTuple( parameters, score, np.array(all_scores))) self.grid_scores_ = grid_scores best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def _fit(self, X, y, groups=None, parameter_iterable=None, **fit_params): if groups is not None: raise NotImplementedError('The groups argument is not supported.') if parameter_iterable is not None: raise NotImplementedError('The parameter_iterable argument is not supported.') if self.fit_params is not None: fit_params = self.fit_params # Actual fitting, performing the search over parameters. estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) n_folds, cv_iter = our_check_cv(cv, X, y, classifier=is_classifier(estimator)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch # setup SigOpt experiment and run optimization self._create_sigopt_exp(self.sigopt_connection) # start tracking time to optimize estimator opt_start_time = time.time() for jk in range(0, self.n_iter, self.n_sug): # check for opt timeout, ensuring at least 1 observation # TODO : handling failure observations if ( self.opt_timeout is not None and time.time() - opt_start_time > self.opt_timeout and jk >= 1 ): # break out of loop and refit model with best params so far break suggestions = [] parameter_configs = [] for _ in range(self.n_sug): suggestion = self.sigopt_connection.experiments(self.experiment.id).suggestions().create() parameters = self._convert_sigopt_api_to_sklearn_assignments(suggestion.assignments.to_json()) suggestions.append(suggestion) parameter_configs.append(parameters) if self.verbose > 0: print('Evaluating params : ', parameter_configs) # do CV folds in parallel using joblib # returns scores on test set obs_timed_out = False try: par_kwargs = {'n_jobs': self.n_jobs, 'verbose': self.verbose, 'pre_dispatch': pre_dispatch} # add timeout kwarg if version of joblib supports it if 'timeout' in getfullargspec(Parallel.__init__).args: par_kwargs['timeout'] = self.cv_timeout out = Parallel( **par_kwargs )( delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, fit_params, return_parameters=True, error_score=self.error_score) for parameters in parameter_configs for train, test in cv_iter) except TimeoutError: obs_timed_out = True if not obs_timed_out: # grab scores from results for sidx, suggestion in enumerate(suggestions): out_idx = sidx * n_folds scores = [o[0] for o in out[out_idx:out_idx+n_folds]] self.sigopt_connection.experiments(self.experiment.id).observations().create( suggestion=suggestion.id, value=numpy.mean(scores), value_stddev=numpy.std(scores) ) else: # obsevation timed out so report a failure self.sigopt_connection.experiments(self.experiment.id).observations().create( suggestion=suggestion.id, failed=True) # return best SigOpt assignments so far best_assignments = self.sigopt_connection.experiments(self.experiment.id).best_assignments().fetch().data if not best_assignments: raise RuntimeError( 'No valid observations found. 
' 'Make sure opt_timeout and cv_timeout provide sufficient time for observations to be reported.') self.our_best_params_ = self._convert_sigopt_api_to_sklearn_assignments( best_assignments[0].assignments.to_json()) self.our_best_score_ = best_assignments[0].value if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params(**self.best_params_) if y is not None: best_estimator.fit(X, y, **fit_params) else: best_estimator.fit(X, **fit_params) self.our_best_estimator_ = best_estimator return self
def _fit(self, X, y, groups, parameter_iterable): estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(n_splits, n_candidates, n_candidates * n_splits)) base_estimator = clone(self.estimator) param_grid = [(parameters, train, test) for parameters in parameter_iterable for train, test in list(cv.split(X, y, groups))] # Because the original python code expects a certain order for the elements, we need to # respect it. indexed_param_grid = list(zip(range(len(param_grid)), param_grid)) par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid)) X_bc = self.sc.broadcast(X) y_bc = self.sc.broadcast(y) scorer = self.scorer_ verbose = self.verbose error_score = self.error_score fit_params = self.fit_params return_train_score = self.return_train_score fas = _fit_and_score def fun(tup): (index, (parameters, train, test)) = tup local_estimator = clone(base_estimator) local_X = X_bc.value local_y = y_bc.value res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose, parameters, fit_params, return_train_score=return_train_score, return_n_test_samples=True, return_times=True, return_parameters=True, error_score=error_score) return (index, res) indexed_out0 = dict(par_param_grid.map(fun).collect()) out = [indexed_out0[idx] for idx in range(len(param_grid))] if return_train_score: (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out) else: (test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out) X_bc.unpersist() y_bc.unpersist() candidate_params = parameters[::n_splits] n_candidates = len(candidate_params) results = dict() def _store(key_name, array, weights=None, splits=False, rank=False): """A small helper to store the scores/times to the cv_results_""" # When iterated first by splits, then by parameters array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits) if splits: for split_i in range(n_splits): results["split%d_%s" % (split_i, key_name)] = array[:, split_i] array_means = np.average(array, axis=1, weights=weights) results['mean_%s' % key_name] = array_means # Weighted std is not directly available in numpy array_stds = np.sqrt(np.average((array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights)) results['std_%s' % key_name] = array_stds if rank: results["rank_%s" % key_name] = np.asarray( rankdata(-array_means, method='min'), dtype=np.int32) # Computed the (weighted) mean and std for test scores alone # NOTE test_sample counts (weights) remain the same for all candidates test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=np.int) _store('test_score', test_scores, splits=True, rank=True, weights=test_sample_counts if self.iid else None) if self.return_train_score: _store('train_score', train_scores, splits=True) _store('fit_time', fit_time) _store('score_time', score_time) best_index = np.flatnonzero(results["rank_test_score"] == 1)[0] best_parameters = candidate_params[best_index] # Use one MaskedArray and mask all the places where the param is not # applicable for that candidate. 
Use defaultdict as each candidate may # not contain all the params param_results = defaultdict(partial(MaskedArray, np.empty(n_candidates,), mask=True, dtype=object)) for cand_i, params in enumerate(candidate_params): for name, value in params.items(): # An all masked empty array gets created for the key # `"param_%s" % name` at the first occurence of `name`. # Setting the value at an index also unmasks that index param_results["param_%s" % name][cand_i] = value results.update(param_results) # Store a list of param dicts at the key 'params' results['params'] = candidate_params self.cv_results_ = results self.best_index_ = best_index self.n_splits_ = n_splits if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best_parameters) if y is not None: best_estimator.fit(X, y, **fit_params) else: best_estimator.fit(X, **fit_params) self.best_estimator_ = best_estimator return self
def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) param_grid = [(parameters, train, test) for parameters in parameter_iterable for (train, test) in cv] # Because the original python code expects a certain order for the elements, we need to # respect it. indexed_param_grid = list(zip(range(len(param_grid)), param_grid)) par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid)) X_bc = self.sc.broadcast(X) y_bc = self.sc.broadcast(y) scorer = self.scorer_ verbose = self.verbose fit_params = self.fit_params error_score = self.error_score fas = _fit_and_score def fun(tup): (index, (parameters, train, test)) = tup local_estimator = clone(base_estimator) local_X = X_bc.value local_y = y_bc.value res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose, parameters, fit_params, return_parameters=True, error_score=error_score) return (index, res) indexed_out0 = dict(par_param_grid.map(fun).collect()) out = [indexed_out0[idx] for idx in range(len(param_grid))] X_bc.unpersist() y_bc.unpersist() # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _, parameters in \ out[grid_start:grid_start + n_folds]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? grid_scores.append(_CVScoreTuple( parameters, score, np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def _fit(self, X, y, parameter_iterable, en_celery=False): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch if en_celery: out = [] timestamp = timestamp = datetime.now().strftime("%Y%m%d%H%M%s") key = "sample_%s_%s" % (timestamp, int(round(random.random(), 8)*1e8)) red.set(key, pickle.dumps({'X': X, 'y': y})) grp = group(cjobs.fas_mp.s(clone(base_estimator), key, self.scorer_, train, test, self.verbose, parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for parameters in parameter_iterable for train, test in cv)() out = grp.get() red.delete(key) else: out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )( delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for parameters in parameter_iterable for train, test in cv) # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _, parameters in \ out[grid_start:grid_start + n_folds]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? grid_scores.append(_CVScoreTuple( parameters, score, np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def _indexable(x): return indexable(x)[0]
def fit(self, X, y=None, x_is_index=False, X_name='X', y_name='y'): parameter_iterable = ParameterGrid(self.param_grid) """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) # out = Parallel( # n_jobs=self.n_jobs, verbose=self.verbose, # pre_dispatch=pre_dispatch # )( # delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, # train, test, self.verbose, parameters, # self.fit_params, return_parameters=True, # error_score=self.error_score) # for parameters in parameter_iterable # for train, test in cv) train_test_parameters = ((train, test, parameters) \ for parameters in parameter_iterable for train, test in cv) length = len(parameter_iterable) * len(cv) if x_is_index: X_to_pass = X y_to_pass = None else: X_to_pass = None y_to_pass = None self.view.block = False # print('sequences') # sequences = [ # train_test_parameters, # [clone(base_estimator)] * length, # [X_to_pass] * length, # [y_to_pass] * length, # [self.verbose] * length, # [self.fit_params] * length, # [True] * length, # [self.scorer_] * length, # [x_is_index] * length, # ] f = partial(my_fit_and_score, estimator=clone(base_estimator), X=X_to_pass, y=y_to_pass, verbose=self.verbose, fit_params=self.fit_params, return_parameters=True, scorer=None, x_is_index=x_is_index, names=(X_name, y_name)) # print('before map') # import cProfile # # pr = cProfile.Profile() # pr.enable() chunksize = 10 out = self.view.map(f, itertools.islice(train_test_parameters, 0, length), ordered=False, block=False, chunksize=chunksize) # length / len(self.view)) # pr.disable() # pr.print_stats('cumulative') print('map called') if self.callback is not None: old_progress = out.progress while not out.ready(): self.callback(out.progress * chunksize, length, out.elapsed) if old_progress == out.progress and out.progress > 0: for id, info in self.view.queue_status(verbose=True).iteritems(): # print(id, info) if isinstance(info, dict) and 'queue' in info and len(info['queue']) > 0: print(id, info['queue']) pass old_progress = out.progress sleep(10) print('map ready') out = out.get() # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _, parameters in \ out[grid_start:grid_start + n_folds]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? 
grid_scores.append(_CVScoreTuple( parameters, score, np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def _fit(self, X, y, parameter_dict):
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))

    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, est=clone(self.estimator),
                   fitness=creator.FitnessMax)

    toolbox = base.Toolbox()

    name_values, gene_type, maxints = _get_param_types_maxint(parameter_dict)
    if self.gene_type is None:
        self.gene_type = gene_type
    if self.verbose:
        print("Types %s and maxint %s detected" % (self.gene_type, maxints))

    toolbox.register("individual", _initIndividual, creator.Individual,
                     maxints=maxints)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", _evalFunction, searchobj=self,
                     name_values=name_values, X=X, y=y, scorer=self.scorer_,
                     cv=cv, iid=self.iid, verbose=self.verbose,
                     error_score=self.error_score, fit_params=self.fit_params)
    toolbox.register("mate", _cxIndividual, indpb=self.gene_crossover_prob,
                     gene_type=self.gene_type)
    toolbox.register("mutate", _mutIndividual, indpb=self.gene_mutation_prob,
                     up=maxints)
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    if self.n_jobs > 1:
        pool = Pool(processes=self.n_jobs)
        toolbox.register("map", pool.map)

    pop = toolbox.population(n=self.population_size)
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    if self.verbose:
        print('--- Evolve in {0} possible combinations ---'.format(
            np.prod(np.array(maxints) + 1)))

    pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                       ngen=self.generations_number,
                                       stats=stats, halloffame=hof,
                                       verbose=self.verbose)

    current_best_score_ = hof[0].fitness.values[0]
    current_best_params_ = _individual_to_params(hof[0], name_values)
    if self.verbose:
        print("Best individual is: %s\nwith fitness: %s" % (
            current_best_params_, current_best_score_))
        print("Scoring evaluations: %d, Cache hits: %d, Total: %d" % (
            self.num_evaluations, self.num_cache_hits,
            self.num_evaluations + self.num_cache_hits))

    if current_best_score_ > self.best_score_:
        self.best_score_ = current_best_score_
        self.best_params_ = current_best_params_
def _maybe_indexable(x): return indexable(x)[0] if _is_arraylike(x) else x
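# Illustration (not part of the original snippets): quick sketch of the two
# helpers above, assuming they are in scope. Array-likes are routed through
# sklearn's indexable, while scalars pass through _maybe_indexable unchanged.
print(_indexable([1, 2, 3]))         # [1, 2, 3]
print(_maybe_indexable(42))          # 42
print(_maybe_indexable([1, 2, 3]))   # [1, 2, 3]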
def _extendedFit(self, X, y, parameter_iterable): estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: if isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv))) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )( delayed(_extended_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for parameters in parameter_iterable for train, test in cv) # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) scores = list() grid_scores = list() grid_extras = list() for grid_start in range(0, n_fits, n_folds): n_test_samples = 0 score = 0 all_scores = [] all_extras = [] for this_score, this_n_test_samples, _, parameters, extra in \ out[grid_start:grid_start + n_folds]: all_scores.append(this_score) all_extras.append(extra) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_folds) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? grid_scores.append(_CVScoreTuple( parameters, score, np.array(all_scores))) grid_extras.append(all_extras) # Store the computed scores self.grid_scores_ = grid_scores self.extras_ = grid_extras # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: print "Refitting best estimator" # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
def _fit(self, X, y): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = self.cv self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y = indexable(X, y) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch # setup SigOpt experiment and run optimization self._create_sigopt_exp() for jk in xrange(self.n_iter): suggestion = self.conn.experiments(self.experiment.id).suggestions().create() parameters = suggestion.assignments.to_json() # convert all unicode names and values to plain strings non_unicode_parameters = self._convert_unicode_dict(parameters) if self.verbose > 0: print "Evaluating params : ",non_unicode_parameters # do CV folds in parallel using joblib # returns scores on test set out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )( delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, non_unicode_parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for train, test in cv) # grab scores from results scores = [o[0] for o in out] self.conn.experiments(self.experiment.id).observations().create( suggestion=suggestion.id, value=numpy.mean(scores), value_stddev=numpy.std(scores) ) # return best SigOpt observation so far best_obs = self.conn.experiments(self.experiment.id).fetch().progress.best_observation self.best_params_ = best_obs.assignments.to_json() # convert all unicode names and values to plain strings self.best_params_ = self._convert_unicode_dict(self.best_params_) self.best_score_ = best_obs.value if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **self.best_params_) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator
def _fit(self, X, y, groups, parameter_iterable): """ Actual fitting, performing the search over parameters. Taken from https://github.com/scikit-learn/scikit-learn/blob/0.18.X .../sklearn/model_selection/_search.py """ estimator = self.estimator cv = sklearn.model_selection._validation.check_cv(self.cv, y, classifier= is_classifier(estimator)) self.scorer_ = check_scoring( self.estimator, scoring=self.scoring) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(n_splits, n_candidates, n_candidates * n_splits)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch cv_iter = list(cv.split(X, y, groups)) out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )(delayed(sklearn.model_selection._validation._fit_and_score)( clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, fit_params=self.fit_params, return_train_score=self.return_train_score, return_n_test_samples=True, return_times=True, return_parameters=True, error_score=self.error_score ) for parameters in parameter_iterable for train, test in cv_iter ) # if one choose to see train score, "out" will contain train score info if self.return_train_score: (train_scores, test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out) else: (test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out) candidate_params = parameters[::n_splits] n_candidates = len(candidate_params) results = dict() def _store(key_name, array, weights=None, splits=False, rank=False): """A small helper to store the scores/times to the cv_results_""" array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits) if splits: for split_i in range(n_splits): results["split%d_%s" % (split_i, key_name)] = array[:, split_i] array_means = np.average(array, axis=1, weights=weights) results['mean_%s' % key_name] = array_means # Weighted std is not directly available in numpy array_stds = np.sqrt(np.average((array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights)) results['std_%s' % key_name] = array_stds if rank: results["rank_%s" % key_name] = np.asarray( rankdata(-array_means, method='min'), dtype=np.int32) # Computed the (weighted) mean and std for test scores alone # NOTE test_sample counts (weights) remain the same for all candidates test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=np.int) _store('test_score', test_scores, splits=True, rank=True, weights=test_sample_counts if self.iid else None) if self.return_train_score: _store('train_score', train_scores, splits=True) _store('fit_time', fit_time) _store('score_time', score_time) best_index = np.flatnonzero(results["rank_test_score"] == 1)[0] best_parameters = candidate_params[best_index] # Use one MaskedArray and mask all the places where the param is not # applicable for that candidate. Use defaultdict as each candidate may # not contain all the params param_results = defaultdict(partial( MaskedArray, np.empty(n_candidates,), mask=True, dtype=object)) for cand_i, params in enumerate(candidate_params): for name, value in params.items(): # An all masked empty array gets created for the key # `"param_%s" % name` at the first occurence of `name`. 
# Setting the value at an index also unmasks that index param_results["param_%s" % name][cand_i] = value results.update(param_results) # Store a list of param dicts at the key 'params' results['params'] = candidate_params self.cv_results_ = results self.best_index_ = best_index self.n_splits_ = n_splits if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best_parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self