def _pseudo_label(self, X):
    while not self.max_iter or self.n_iter_ < self.max_iter:

        # Select rows not added yet
        index = X.index.difference(self.X_.index)
        X_new = X.loc[index]
        if not len(index):
            break

        # Predict probabilities
        y_prob = self.estimator_.predict_proba(X_new)
        y_prob = pd.DataFrame(y_prob, index=X_new.index)
        y_new = y_prob.apply(lambda row: row.idxmax(), axis=1)

        # Mask rows with high certainty
        mask = (y_prob >= self.proba).any(axis=1)
        if not mask.any():
            break

        # Add labeled data & fit
        self.partial_fit(X_new[mask], y_new[mask])

        # Verbose
        if self.verbose:
            logmsg(f"ITER {self.n_iter_}: Add {mask.sum()} labels")

    return self.estimator_
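
# Below is a minimal, self-contained sketch of the same self-training idea
# using plain scikit-learn. It is illustrative only: the helper name
# `pseudo_label_step` and its `threshold` parameter are not part of this
# module's API.
def pseudo_label_step(clf, X_lab, y_lab, X_unlab, threshold=0.95):
    """One pseudo-labeling round: fit, predict, absorb confident rows.

    Returns the confidently predicted rows and their pseudo-labels, which
    a caller may append to (X_lab, y_lab) and iterate until no rows pass
    the threshold.
    """
    import pandas as pd

    clf.fit(X_lab, y_lab)
    proba = pd.DataFrame(clf.predict_proba(X_unlab), index=X_unlab.index,
                         columns=clf.classes_)
    mask = proba.max(axis=1) >= threshold   # rows predicted with high certainty
    y_new = proba[mask].idxmax(axis=1)      # most probable class per row
    return X_unlab[mask], y_new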
def _log(self, msg, end=' ' * 4):
    if not self.verbose:
        return
    if self.compact:
        print(msg, end=end)
    else:
        utils.logmsg(msg)
    time.sleep(0.01)
def crossval(estimator, cv, X, y, groups=None, X_new=None, new_index=None,
             scoring=None, test_avg=True, avg_type='auto', method='predict',
             return_pred=True, return_estimator=False, verbose=2, n_digits=4,
             n_jobs=None, compact=False, train_score=False, y_transform=None,
             **kwargs):
    """Evaluate metric(s) by cross-validation and also record fit/score time,
    feature importances and compute out-of-fold and test predictions.

    Parameters
    ----------
    estimator : estimator object
        The object to use to fit the data.

    cv : int, cross-validation generator or an iterable
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the estimator is a classifier and ``y``
        is either binary or multiclass, :class:`StratifiedKFold` is used.
        In all other cases, :class:`KFold` is used.

    X : DataFrame, shape [n_samples, n_features]
        The data to fit, score and calculate out-of-fold predictions.

    y : Series, shape [n_samples]
        The target variable to try to predict.

    groups : array-like, shape [n_samples] or None
        Group labels for the samples used while splitting the dataset into
        train/test set.

    X_new : DataFrame, shape [m_samples, n_features] or None
        The unseen data to predict (test set).

    new_index : iterable or None
        Indices for the test set if the passed X_new is not a DataFrame.
        Ignored if X_new is a DataFrame or None.

    test_avg : bool
        Stacking strategy (essential parameter).

        - True: bagged predictions for the test set (given that we have
          N folds, we fit N models, one on each fold's train data; each
          model then predicts the test set, and we aggregate by bagging:
          the mean of predicted values for regression or class
          probabilities, or the majority vote / mode when predictions are
          class labels).
        - False: single predictions for the test set (the estimator is
          fitted once on the full train set, then predicts the test set).

        Ignored if return_pred=False or X_new is not defined.

    scoring : string, callable or None, optional, default: None
        A string or a scorer callable object / function with signature
        ``scorer(estimator, X, y)`` which should return only a single value.
        If None, the estimator's default scorer (if available) is used.

    avg_type : string, {'mean', 'soft', 'hard', 'auto', 'rank', 'pass'} (default='auto')
        Averaging strategy for aggregating different CV folds predictions.

        - 'hard' : use predicted class labels for majority rule voting.
          Ignored if estimator type is 'regressor'.
          Ignored if <return_pred> set to False.
          Ignored if <method> is not 'predict'.
        - 'soft' : predict the class label based on the argmax of the sums
          of the predicted probabilities, which is recommended for an
          ensemble of well-calibrated classifiers.
          Ignored if estimator type is 'regressor'.
          Ignored if <return_pred> set to False.
          Ignored if <method> is not 'predict'.
        - 'auto' : use simple averaging for regressor's predictions and for
          classifier's probabilities (if <method> is 'predict_proba');
          if estimator type is 'classifier' and <method> is 'predict',
          set <averaging> to 'soft' for a classifier with a <predict_proba>
          attribute, and to 'hard' otherwise.
          Ignored if <return_pred> set to False.
        - 'rank' : rank probabilities along each fold, then average.
          Preferred for scoring like 'AUC-ROC'.
        - 'pass' : leave predictions of different folds separated.
          Column '_FOLD' will be added.
        - 'mean' : simple averaging of classifier's probabilities or
          regressor's predictions.

        Ignored if <return_pred> set to False, or <method> is not 'predict'.

    method : string, optional, default: 'predict'
        Invokes the passed method name of the passed estimator. For
        method='predict_proba', the columns correspond to the classes in
        sorted order.
        Ignored if return_pred=False.

    return_pred : bool (default=True)
        Return out-of-fold predictions (and test predictions, if X_new is
        defined).

    return_estimator : bool (default=False)
        Return fitted estimators.

    n_jobs : int or None, optional (default=None)
        The number of jobs to run in parallel. None means 1.

    verbose : int (default=2)
        Verbosity level.

    n_digits : int (default=4)
        Verbose score(s) precision.

    compact : bool (default=False)
        Print verbose in one line. Useful for evaluating a series of
        estimators.

    train_score : bool (default=False)
        If True, print and return the train score for each fold.

    y_transform : callable (default=None)
        Transform the target before fit.

    Returns
    -------
    result : dict of array, float or Series
        Array of scores/predictions/time of the estimator for each run of
        the cross validation. If test_avg=True, arrays have shape
        [n_splits], otherwise [n_splits+1], except score & score_time.

        The possible keys for this ``dict`` are:

        ``fold`` : list of pair of list
            Two lists with trn/oof indices.

        ``scorer`` : scorer object
            Func with signature scorer(estimator, X, y).

        ``val_score`` : array or dict of array, shape [n_splits]
            The score array for test scores on each cv split.
            If multimetric, return dict of array.

        ``trn_score`` : array or dict of array, shape [n_splits]
            The score array for train scores on each cv split.
            If multimetric, return dict of array.

        ``oof_pred`` : Series, shape [n_samples]
            Out-of-fold predictions.
            Ignored if return_pred=False.

        ``new_pred`` : Series, shape [m_samples]
            Test predictions (unseen data).
            Ignored if return_pred=False.

        ``fit_time`` : array of float, shape [n_splits] or [n_splits+1]
            The time for fitting the estimator on the train set for each
            cv split.

        ``pred_time`` : array of float, shape [n_splits] or [n_splits+1]
            Out-of-fold and test predictions time.
            Ignored if return_pred=False.

        ``score_time`` : array of float, shape [n_splits]
            Out-of-fold scores time for each cv split.

        ``concat_time`` : float
            Extra time spent on concatenation of predictions, importances
            or scores dictionaries. Ignored if all of return_pred,
            return_importance, return_score are set to False.

        ``estimator`` : list of estimator object, shape [n_splits] or [n_splits+1]
            The fitted estimator objects for each cv split (and, if
            test_avg=False, the estimator fitted on the full train set).
            Ignored if return_estimator=False.

        ``importance`` : list of arrays, shape [n_splits, n_features]
            List of importances. If the estimator has a <coef_> attribute,
            return np.abs(coef_).

        ``features`` : list, shape [n_features]
            List of features.
""" # Check parameters X, y, groups = indexable(X, y, groups) X_new, _ = indexable(X_new, None) cv = check_cv(cv, y, classifier=is_classifier(estimator)) avg, method = _check_avg(estimator, avg_type, method) scorer = check_scoring(estimator, scoring) # Fit & predict logger = CVLogger(estimator, cv, verbose, n_digits, compact) logger.start() parallel = Parallel(max_nbytes='256M', pre_dispatch='2*n_jobs', n_jobs=n_jobs, require='sharedmem') if test_avg: # Stacking Type A (test averaging = True) result = parallel( delayed(_fit_predict)( copy(estimator), method, scorer, X, y, X_new, new_index, trn, oof, return_estimator, return_pred, fold, logger, train_score, y_transform) for fold, (trn, oof) in enumerate(cv.split(X, y, groups))) result = ld2dl(result) else: # Stacking Type B (test_averaging = False) result = parallel( (delayed(_fit_predict)( copy(estimator), method, scorer, X, y, None, None, trn, oof, return_estimator, return_pred, fold, logger, train_score, y_transform) for fold, (trn, oof) in enumerate(cv.split(X, y, groups)))) if verbose >= 2: print() logmsg('Fitting full train set...') result_new = _fit_predict(copy(estimator), method, None, X, y, X_new, new_index, None, None, return_estimator, return_pred, -1, logger, train_score, y_transform) result = ld2dl(result) for key, val in result_new.items(): if key in result: result[key].append(val) else: result[key] = [val] # Concat Predictions (& Feature Importances) needs_concat = ['oof_pred', 'new_pred', 'importance', 'val_score', 'trn_score'] if np.any(np.in1d(needs_concat, list(result))): tic = time() if 'oof_pred' in result: oof_preds = result['oof_pred'] oof_pred = _avg_preds(oof_preds, avg, X, y, y.index) result['oof_pred'] = oof_pred if 'new_pred' in result: new_preds = result['new_pred'] new_pred = _avg_preds(new_preds, avg, X_new, y, new_index) result['new_pred'] = new_pred for key in ['fit_time', 'score_time', 'pred_time']: if key in result: result[key] = np.array(result[key]) result['concat_time'] = time() - tic if hasattr(X, 'columns'): result['features'] = list(X.columns.values) result['datetime'] = datetime.now() result['scorer'] = scorer result['cv'] = cv # Final score logger.end(result) # Additional kwargs result.update(kwargs) return result
def _fit(self, X, y, groups):
    if self.forward:
        is_final = lambda subset: len(subset) >= self.k_features_
    else:
        is_final = lambda subset: len(subset) <= self.k_features_

    self.eval_subset(self.subset_, X, y, groups)
    self.score_ = self.subset_.score

    while not is_final(self.subset_):

        # STEP 1. Step Forward/Backward
        if self.verbose:
            logmsg('STEP {}'.format('FORWARD' if self.forward else 'BACKWARD'))

        if self.forward:
            updates = self.features_.remove(*self.subset_)
        else:
            updates = self.subset_

        # Find next best update
        score = -np.inf
        subset = None

        for feature in updates:

            # Include/exclude feature
            if self.forward:
                candidate = self.subset_.append(feature)
            else:
                candidate = self.subset_.remove(feature)
            candidate.parents = (self.subset_, )

            # Evaluate candidate (skip candidates that fail to evaluate)
            try:
                self.eval_subset(candidate, X, y, groups)
                if candidate.score > score:
                    score = candidate.score
                    subset = candidate
            except KeyboardInterrupt:
                raise
            except Exception:
                pass

        # No candidate could be evaluated
        if subset is None:
            break

        # Update subset
        self.subset_ = subset
        self.score_ = score

        # Stop criteria
        if not self.floating or is_final(self.subset_):
            continue

        # STEP 2. Step Backward/Forward
        if self.verbose:
            logmsg('STEP {}'.format('BACKWARD' if self.forward else 'FORWARD'))

        if not self.forward:
            updates = self.features_.remove(*self.subset_)
        else:
            updates = self.subset_

        # Find next best update
        score = -np.inf
        subset = None

        for feature in updates:

            # Exclude/include feature
            if not self.forward:
                candidate = self.subset_.append(feature)
            else:
                candidate = self.subset_.remove(feature)
            candidate.parents = (self.subset_, )

            # Check if already exists
            if candidate in self.trials_:
                continue

            # Evaluate candidate (skip candidates that fail to evaluate)
            try:
                self.eval_subset(candidate, X, y, groups)
                if candidate.score > score:
                    score = candidate.score
                    subset = candidate
            except KeyboardInterrupt:
                raise
            except Exception:
                pass

        # Stop criteria: keep the step 1 result unless step 2 improves it
        if subset is None or score < self.score_:
            continue

        # Update subset
        self.subset_ = subset
        self.score_ = score

    return self
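
# For reference, a compact sketch of plain greedy forward selection (no
# floating step) using scikit-learn's cross_val_score. Everything here is
# illustrative and independent of this class's Subset/eval_subset machinery.
def forward_selection(estimator, X, y, k_features, cv=5):
    from sklearn.model_selection import cross_val_score

    selected, remaining = [], list(X.columns)
    while len(selected) < k_features and remaining:
        # Score every single-feature extension of the current subset
        scores = {f: cross_val_score(estimator, X[selected + [f]], y,
                                     cv=cv).mean()
                  for f in remaining}
        best = max(scores, key=scores.get)   # greedy: keep the best extension
        selected.append(best)
        remaining.remove(best)
    return selected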
def start(self):
    if not self.compact and self.verbose >= 2:
        utils.logmsg(' ' + self.name)
        print()
def _print_last(opt):
    '''
    Print last trial score in optimizer.

    Parameters
    ----------
    opt : instance
        Optimizer instance.

    '''
    trial = opt.trials_.iloc[-1]

    if opt.verbose >= 1:

        # Iterations
        n = opt.max_iter if hasattr(opt, 'max_iter') else None
        k = opt.n_iters_
        iters = '{}/{}'.format(k, n) if n else '{}'.format(k)

        if trial['status'] == 'ok':

            # Score
            score = '{:.{prec}f}'.format(trial['score'], prec=opt.n_digits)
            std = '{:.{prec}f}'.format(trial['score_std'], prec=opt.n_digits)

            # FIXME: colorlog & termcolor conflict...
            # https://github.com/borntyping/python-colorlog
            score = colored(score, 'yellow') if (
                opt.trials_['score'].idxmax() == k - 1) else score
            std = colored(std, 'cyan') if (
                opt.trials_['score_std'].idxmin() == k - 1) else std

            score = '{} ± {}'.format(score, std)

            # Estimated time of arrival (ETA)
            if hasattr(opt, 'max_time') and opt.max_time:
                eta0 = max(0, opt.max_time - opt.total_time_)
            else:
                eta0 = np.inf

            if hasattr(opt, 'max_iter') and opt.max_iter:
                eta1 = max(0, (opt.total_time_ / k) * (n - k))
            else:
                eta1 = np.inf

            eta = min(eta0, eta1)
            if eta < np.inf:
                eta = secfmt(eta)
                eta = ' ETA: {}'.format(eta)
            else:
                eta = ''

            msg = 'ITER: {} SCORE: {}{}'.format(iters, score, eta)
            logmsg(msg)

        else:
            msg = 'ITER: {} - {}!'.format(iters, trial['status'])
            logmsg(msg)

    if opt.verbose >= 2:
        print(pd.Series(trial['params'], dtype='str'))
        print()
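
# The ETA above is the tighter of two bounds: the remaining time budget, and
# the projected time for the remaining iterations at the average pace so far.
# A standalone restatement (the helper name and parameters are illustrative):
def _eta_seconds(total_time, k_done, max_iter=None, max_time=None):
    import numpy as np
    eta_time = max(0, max_time - total_time) if max_time else np.inf
    eta_iter = (max(0, (total_time / k_done) * (max_iter - k_done))
                if max_iter else np.inf)
    return min(eta_time, eta_iter)   # np.inf means no estimate is available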
def _fit(self, X, y, groups):

    # Define crossover & mutation
    mate = CROSSOVER[self.crossover]
    self.toolbox.register("mate", mate, random_state=self.rstate)
    self.toolbox.register("mutate", mutSubset, random_state=self.rstate,
                          indpb=self.mutation)

    # Define evaluation & selection
    self.toolbox.register("eval", self.eval_subset, X=X, y=y, groups=groups)
    self.toolbox.register("select", tools.selTournament, tournsize=5,
                          fit_attr='score')

    while not self.n_gen or self.k_gen_ < self.n_gen:

        if self.verbose:
            logmsg(f'GENERATION {self.k_gen_+1}')

        try:
            offspring = []

            # Apply crossover (parents weighted by normalized score rank)
            if self.k_gen_ > 0:
                weights = [ind.score for ind in self.population]
                weights = get_ranks(weights, normalize=True)
            else:
                weights = None

            for _ in range(self.pop_size):
                ind1, ind2 = self.rstate.choice(self.population, 2, p=weights)
                child, _ = self.toolbox.mate(ind1, ind2)
                offspring.append(child)

            # Apply mutation
            for ind in offspring:
                self.toolbox.mutate(ind)

            # Evaluate
            for ind in offspring:
                self.toolbox.eval(ind)

            # Select
            self.population = self.toolbox.select(offspring, k=self.pop_size)
            self.k_gen_ += 1

        except KeyboardInterrupt:
            break

        if self.verbose:
            print()

            scores = [ind.score for ind in offspring]
            avg = np.mean(scores)
            std = np.std(scores)
            logmsg('SCORE AVG: {:.{n}f} ± {:.{n}f}'.format(avg, std,
                                                           n=self.n_digits))
            logmsg('SCORE MIN: {:.{n}f}'.format(np.min(scores),
                                                n=self.n_digits))
            logmsg('SCORE MAX: {:.{n}f}'.format(np.max(scores),
                                                n=self.n_digits))
            print()

            sizes = [ind.n_selected for ind in offspring]
            avg = int(np.mean(sizes))
            std = int(np.std(sizes))
            logmsg('SIZE AVG: {} ± {}'.format(avg, std))
            logmsg('SIZE MIN: {}'.format(np.min(sizes)))
            logmsg('SIZE MAX: {}'.format(np.max(sizes)))
            print()

            times = [ind.eval_time for ind in offspring]
            time_avg = secfmt(np.mean(times))
            time_sum = secfmt(np.sum(times))
            logmsg('TIME SUM: {}'.format(time_sum))
            logmsg('TIME AVG: {}'.format(time_avg))
            print()

    return self
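
# A minimal numpy sketch of the genetic operators used above, acting on
# boolean feature masks. `uniform_crossover` and `mutate_mask` are
# illustrative stand-ins for this module's CROSSOVER table and `mutSubset`.
def uniform_crossover(mask1, mask2, rstate):
    import numpy as np
    # Each gene (feature flag) is taken from either parent with prob 0.5
    take_first = rstate.rand(mask1.size) < 0.5
    return np.where(take_first, mask1, mask2)

def mutate_mask(mask, rstate, indpb=0.05):
    import numpy as np
    # Flip each feature flag independently with probability `indpb`
    flips = rstate.rand(mask.size) < indpb
    return np.logical_xor(mask, flips)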