Example #1
    def _pseudo_label(self, X):

        while not self.max_iter or self.n_iter_ < self.max_iter:

            # Select rows not yet added to the labeled set
            index = X.index.difference(self.X_.index)

            if not len(index):
                break

            X_new = X.loc[index]

            # Predict probabilities
            y_prob = self.estimator_.predict_proba(X_new)
            y_prob = pd.DataFrame(y_prob, index=X_new.index)
            y_new = y_prob.apply(lambda row: row.idxmax(), axis=1)

            # Mask rows with high certainty
            mask = (y_prob >= self.proba).any(axis=1)
            if not mask.any():
                break

            # Add labeled data & fit
            self.partial_fit(X_new[mask], y_new[mask])

            # Verbose
            if self.verbose:
                logmsg(f"ITER {self.n_iter_}: Add {mask.sum()} labels")

        return self.estimator_
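
A minimal standalone sketch of the same pseudo-labeling idea, assuming a fitted scikit-learn classifier and pandas inputs (the names clf, X_labeled, y_labeled, X_pool and the 0.95 threshold are hypothetical, not part of the original class):

import pandas as pd

def pseudo_label_once(clf, X_labeled, y_labeled, X_pool, proba=0.95):
    # Predict class probabilities for the unlabeled pool
    y_prob = pd.DataFrame(clf.predict_proba(X_pool),
                          index=X_pool.index, columns=clf.classes_)
    # Keep only rows where the most confident class clears the threshold
    mask = y_prob.max(axis=1) >= proba
    if not mask.any():
        return X_labeled, y_labeled, X_pool
    y_new = y_prob.loc[mask].idxmax(axis=1)
    # Move the high-confidence rows into the labeled set and refit
    X_labeled = pd.concat([X_labeled, X_pool.loc[mask]])
    y_labeled = pd.concat([y_labeled, y_new])
    clf.fit(X_labeled, y_labeled)
    return X_labeled, y_labeled, X_pool.loc[~mask]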
Example #2
    def _log(self, msg, end=' ' * 4):
        if not self.verbose:
            return
        if self.compact:
            print(msg, end=end)
        else:
            utils.logmsg(msg)
        # Brief pause, presumably to keep interleaved output readable
        time.sleep(0.01)
Example #3
def crossval(estimator, cv, X, y, groups=None, X_new=None, new_index=None,
             scoring=None, test_avg=True, avg_type='auto', method='predict',
             return_pred=True, return_estimator=False, verbose=2, n_digits=4,
             n_jobs=None, compact=False, train_score=False, y_transform=None,
             **kwargs):
    """Evaluate metric(s) by cross-validation and also record fit/score time,
    feature importances and compute out-of-fold and test predictions.

    Parameters
    ----------
    estimator : estimator object
        The object to use to fit the data.

    cv : int, cross-validation generator or an iterable
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

    X : DataFrame, shape [n_samples, n_features]
        The data to fit, score and calculate out-of-fold predictions for.

    y : Series, shape [n_samples]
        The target variable to try to predict.

    groups : array-like, shape [n_samples] or None
        Group labels for the samples used while splitting the dataset into
        train/test set.

    X_new : DataFrame, shape [m_samples, n_features] or None
        The unseen data to predict (test set).

    new_index : iterable or None
        Indices for the test set if the passed X_new is not a DataFrame.
        Ignored if X_new is a DataFrame or None.

    test_avg : bool
        Stacking strategy (essential parameter)

        - True: bagged predictions for the test set (given that we have N folds,
                we fit N models on each fold's train data, then each model
                predicts the test set, then we perform bagging: compute the mean
                of predicted values (for regression or class probabilities) or
                the majority vote: compute the mode (when predictions are class
                labels))

        - False: predictions for the test set (the estimator is fitted once on
                 the full train set, then predicts the test set)

        Ignored if return_pred=False or X_new is not defined.

    scoring : string, callable or None, optional, default: None
        A string or a scorer callable object / function with signature
        ``scorer(estimator, X, y)`` which should return only a single value.
        If None, the estimator's default scorer (if available) is used.

    avg_type : string, {'mean', 'soft', 'hard', 'auto', 'rank', 'pass'} (default='auto')
        Averaging strategy for aggregating predictions from different CV folds

        - 'hard' : use predicted class labels for majority rule voting.

                   Ignored if estimator type is 'regressor'.
                   Ignored if <return_pred> set to False.
                   Ignored if <method> is not 'predict'.

        - 'soft' : predicts the class label based on the argmax of the sums
                   of the predicted probabilities, which is recommended for
                   an ensemble of well-calibrated classifiers.

                   Ignored if estimator type is 'regressor'.
                   Ignored if <return_pred> set to False.
                   Ignored if <method> is not 'predict'.

        - 'auto' : use simple averaging for a regressor's predictions and for
                   a classifier's probabilities (if <method> is 'predict_proba');

                   if estimator type is 'classifier' and <method> is 'predict',
                   set <avg_type> to 'soft' for a classifier with a <predict_proba>
                   attribute, and to 'hard' otherwise.

                   Ignored if <return_pred> set to False.

        - 'rank' : rank probabilities within each fold, then average.

                   Preferred for ranking metrics such as ROC AUC.

        - 'pass' : leave predictions of different folds separated.

                   Column '_FOLD' will be added.

        - 'mean' : simple averaging of classifier's probabilities or
                   regressor's predictions.

        Ignored if <return_pred> set to False, or <method> is not 'predict'.

    method : string, optional, default: 'predict'
        Invokes the passed method name of the passed estimator. For
        method='predict_proba', the columns correspond to the classes
        in sorted order.

        Ignored if return_pred=False.

    return_pred : bool (default=True)
        Return out-of-fold predictions (and test predictions, if X_new is defined).

    return_estimator : bool (default=False)
        Return fitted estimators

    n_jobs : int or None, optional (default=None)
        The number of jobs to run in parallel. None means 1.

    verbose : int (default=2)
        Verbosity level.

    n_digits : int (default=4)
        Precision (number of digits) of printed scores.

    compact : bool (default=False)
        Print verbose output in one line. Useful for evaluating a series of
        estimators.

    train_score : bool (default=False)
        If True, print and return the train score for each fold.

    y_transform : callable (default=None)
        Transformation applied to the target before fitting.


    Returns
    -------
    result : dict of array, float or Series
        Arrays of scores/predictions/times of the estimator for each run of the
        cross-validation. If test_avg=True, arrays have shape [n_splits],
        otherwise [n_splits+1], except score & score_time.

        The possible keys for this ``dict`` are:

            ``fold`` : list of pairs of lists
                Train/out-of-fold indices for each split.

            ``scorer`` : scorer object
                Function with signature scorer(estimator, X, y).

            ``val_score`` : array or dict of array, shape [n_splits]
                The score array for test scores on each cv split.
                If multimetric, return dict of array.

            ``trn_score`` : array or dict of array, shape [n_splits]
                The score array for train scores on each cv split.
                If multimetric, return dict of array.

            ``oof_pred`` : Series, shape [n_samples]
                Out-of-fold predictions.
                Ignored if return_pred=False.

            ``new_pred`` : Series, shape [m_samples]
                Test predictions (unseen data).
                Ignored if return_pred=False.

            ``fit_time`` : array of float, shape [n_splits] or [n_splits+1]
                The time for fitting the estimator on the train
                set for each cv split.

            ``pred_time`` : array of float, shape [n_splits] or [n_splits+1]
                Out-of-fold and test predictions time.
                Ignored if return_pred=False.

            ``score_time`` : array of float, shape [n_splits]
                Out-of-fold scores time for each cv split.

            ``concat_time`` : float
                Extra time spent on concatenation of predictions, importances
                or scores dictionaries. Ignored if all of return_pred,
                return_importance, return_score are set to False.

            ``estimator`` : list of estimator objects, shape [n_splits] or [n_splits+1]
                The fitted estimator objects for each cv split (and, if
                test_avg=False, the estimator fitted on the full train set).
                Ignored if return_estimator=False.

            ``importance`` : list of arrays, shape [n_splits, n_features]
                List of importances. If estimator has <coef_> attribute,
                return np.abs(coef_).

            ``features`` : list, shape [n_features]
                List of features.

    """
    # Check parameters
    X, y, groups = indexable(X, y, groups)
    X_new, _ = indexable(X_new, None)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    avg, method = _check_avg(estimator, avg_type, method)
    scorer = check_scoring(estimator, scoring)

    # Fit & predict
    logger = CVLogger(estimator, cv, verbose, n_digits, compact)
    logger.start()

    parallel = Parallel(max_nbytes='256M', pre_dispatch='2*n_jobs',
                        n_jobs=n_jobs, require='sharedmem')

    if test_avg:

        # Stacking Type A (test averaging = True)
        result = parallel(
            delayed(_fit_predict)(
                copy(estimator), method, scorer, X, y, X_new, new_index,
                trn, oof, return_estimator, return_pred, fold, logger,
                train_score, y_transform)
            for fold, (trn, oof) in enumerate(cv.split(X, y, groups)))

        result = ld2dl(result)

    else:

        # Stacking Type B (test_averaging = False)
        result = parallel(
            (delayed(_fit_predict)(
                copy(estimator), method, scorer, X, y, None, None, trn, oof,
                return_estimator, return_pred, fold, logger, train_score,
                y_transform)
            for fold, (trn, oof) in enumerate(cv.split(X, y, groups))))

        if verbose >= 2:
            print()
            logmsg('Fitting full train set...')

        result_new = _fit_predict(copy(estimator), method, None, X, y, X_new,
                                  new_index, None, None, return_estimator,
                                  return_pred, -1, logger, train_score,
                                  y_transform)

        result = ld2dl(result)
        for key, val in result_new.items():
            if key in result:
                result[key].append(val)
            else:
                result[key] = [val]


    # Concat Predictions (& Feature Importances)
    needs_concat = ['oof_pred', 'new_pred', 'importance', 'val_score', 'trn_score']
    if np.any(np.in1d(needs_concat, list(result))):

        tic = time()

        if 'oof_pred' in result:
            oof_preds = result['oof_pred']
            oof_pred = _avg_preds(oof_preds, avg, X, y, y.index)
            result['oof_pred'] = oof_pred

        if 'new_pred' in result:
            new_preds = result['new_pred']
            new_pred = _avg_preds(new_preds, avg, X_new, y, new_index)
            result['new_pred'] = new_pred

        for key in ['fit_time', 'score_time', 'pred_time']:
            if key in result:
                result[key] = np.array(result[key])

        result['concat_time'] = time() - tic

    if hasattr(X, 'columns'):
        result['features'] = list(X.columns.values)

    result['datetime'] = datetime.now()
    result['scorer'] = scorer
    result['cv'] = cv

    # Final score
    logger.end(result)

    # Additional kwargs
    result.update(kwargs)

    return result
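
A hypothetical end-to-end call illustrating the API documented above (the synthetic data and the RandomForestClassifier are placeholders, not part of the library):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_arr, y_arr = make_classification(n_samples=500, n_features=10, random_state=0)
X, y = pd.DataFrame(X_arr), pd.Series(y_arr)
X_train, X_test, y_train = X.iloc[:400], X.iloc[400:], y.iloc[:400]

result = crossval(RandomForestClassifier(random_state=0), cv=5,
                  X=X_train, y=y_train, X_new=X_test,
                  scoring='roc_auc', method='predict_proba', return_pred=True)

oof_pred = result['oof_pred']     # out-of-fold predictions, indexed like y_train
new_pred = result['new_pred']     # bagged test predictions (test_avg=True)
val_scores = result['val_score']  # per-fold validation scores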
Example #4
    def _fit(self, X, y, groups):

        if self.forward:
            is_final = lambda subset: len(subset) >= self.k_features_
        else:
            is_final = lambda subset: len(subset) <= self.k_features_

            # Backward search: score the initial subset before removing features
            self.eval_subset(self.subset_, X, y, groups)
            self.score_ = self.subset_.score


        while not is_final(self.subset_):

            # STEP 1. Step Forward/Backward
            if self.verbose:
                logmsg('STEP {}'.format('FORWARD' if self.forward else 'BACKWARD'))

            if self.forward:
                updates = self.features_.remove(*self.subset_)
            else:
                updates = self.subset_

            # Find Next Best Update
            score  = -np.inf
            subset = None

            for feature in updates:

                # Include/Exclude Feature
                if self.forward:
                    candidate = self.subset_.append(feature)
                else:
                    candidate = self.subset_.remove(feature)

                candidate.parents = (self.subset_, )

                # Evaluate Candidate
                try:
                    self.eval_subset(candidate, X, y, groups)

                    if candidate.score > score:
                        score  = candidate.score
                        subset = candidate

                except KeyboardInterrupt:
                    raise

                except Exception:
                    # Skip candidates that fail to evaluate
                    pass

            # Update Subset (stop if no candidate could be evaluated)
            if subset is None:
                break
            self.subset_ = subset
            self.score_  = score

            # Stop Criteria
            if not self.floating or is_final(self.subset_):
                continue


            # STEP 2. Step Backward/Forward
            if self.verbose:
                logmsg('STEP {}'.format('BACKWARD' if self.forward else 'FORWARD'))

            if not self.forward:
                updates = self.features_.remove(*self.subset_)
            else:
                updates = self.subset_

            # Find Next Best Update
            score  = -np.inf
            subset = None

            for feature in updates:

                # Exclude/Include Feature
                if not self.forward:
                    candidate = self.subset_.append(feature)
                else:
                    candidate = self.subset_.remove(feature)

                candidate.parents = (self.subset_, )

                # Check if Already Exists
                if candidate in self.trials_:
                    continue

                # Evaluate Candidate
                try:
                    self.eval_subset(candidate, X, y, groups)

                    if candidate.score > score:
                        score  = candidate.score
                        subset = candidate

                except KeyboardInterrupt:
                    raise

                except Exception:
                    # Skip candidates that fail to evaluate
                    pass

            # Stop Criteria
            if score < self.score_:
                continue

            # Update Subset
            self.subset_ = subset
            self.score_  = score

        return self
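
For comparison, a stripped-down greedy forward-selection loop that captures the core of STEP 1 without the floating backward pass; score_subset is a hypothetical callable (e.g. a cross-validated score) and not part of the original class:

import numpy as np

def forward_select(features, score_subset, k_features):
    # Greedily add the single best feature until the subset reaches k_features
    subset, best_score = [], -np.inf
    while len(subset) < k_features:
        scored = []
        for f in features:
            if f in subset:
                continue
            try:
                scored.append((score_subset(subset + [f]), f))
            except Exception:
                continue  # skip candidates that fail to evaluate
        if not scored:
            break
        best_score, best_f = max(scored, key=lambda t: t[0])
        subset.append(best_f)
    return subset, best_score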
Example #5
    def start(self):

        if not self.compact and self.verbose >= 2:
            utils.logmsg(' ' + self.name)
            print()
Example #6
def _print_last(opt):
    '''
    Print last trial score in optimizer.

    Parameters
    ----------
    opt : instance
        Optimizer instance.

    '''
    trial = opt.trials_.iloc[-1]

    if opt.verbose >= 1:

        # Iterations
        n = opt.max_iter if hasattr(opt, 'max_iter') else None
        k = opt.n_iters_
        iters = '{}/{}'.format(k, n) if n else '{}'.format(k)

        if trial['status'] == 'ok':

            # Score
            score = '{:.{prec}f}'.format(trial['score'], prec=opt.n_digits)
            std = '{:.{prec}f}'.format(trial['score_std'], prec=opt.n_digits)

            # FIXME: colorlog & termcolor conflict...
            # https://github.com/borntyping/python-colorlog

            score = colored(score, 'yellow') if (
                opt.trials_['score'].idxmax() == k - 1) else score
            std = colored(std, 'cyan') if (
                opt.trials_['score_std'].idxmin() == k - 1) else std

            score = '{} ± {}'.format(score, std)

            # Estimated time of arrival (ETA)
            if hasattr(opt, 'max_time') and opt.max_time:
                eta0 = max(0, (opt.max_time - opt.total_time_))
            else:
                eta0 = np.inf

            if hasattr(opt, 'max_iter') and opt.max_iter:
                eta1 = max(0, (opt.total_time_ / k) * (n - k))
            else:
                eta1 = np.inf

            eta = min(eta0, eta1)
            if eta < np.inf:
                eta = secfmt(eta)
                eta = '      ETA: {}'.format(eta)
            else:
                eta = ''

            msg = 'ITER: {}      SCORE: {}{}'.format(iters, score, eta)
            logmsg(msg)

        else:
            msg = 'ITER: {} - {}!'.format(iters, trial['status'])
            logmsg(msg)

    if opt.verbose >= 2:
        print(pd.Series(trial['params'], dtype='str'))
        print()
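
The ETA above is the tighter of two estimates: the remaining time budget, and the average time per iteration multiplied by the iterations left. A standalone sketch of that calculation with hypothetical inputs (secfmt in the original merely formats seconds for printing):

import numpy as np

def eta_seconds(total_time, k, n=None, max_time=None):
    # Remaining wall-clock budget, if a time limit is set
    eta_time = max(0.0, max_time - total_time) if max_time else np.inf
    # Projected time for the remaining iterations, if an iteration limit is set
    eta_iter = max(0.0, (total_time / k) * (n - k)) if n else np.inf
    return min(eta_time, eta_iter)

# 12 of 100 iterations done in 30 seconds, no time limit -> roughly 220 s left
print(round(eta_seconds(total_time=30.0, k=12, n=100)))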
Example #7
    def _fit(self, X, y, groups):

        # Define crossover & mutation
        mate = CROSSOVER[self.crossover]
        self.toolbox.register("mate", mate, random_state=self.rstate)
        self.toolbox.register("mutate",
                              mutSubset,
                              random_state=self.rstate,
                              indpb=self.mutation)

        # Define evaluation & selection
        self.toolbox.register("eval",
                              self.eval_subset,
                              X=X,
                              y=y,
                              groups=groups)
        self.toolbox.register("select",
                              tools.selTournament,
                              tournsize=5,
                              fit_attr='score')

        while not self.n_gen or self.k_gen_ < self.n_gen:

            if self.verbose:
                logmsg(f'GENERATION {self.k_gen_+1}')

            try:
                offspring = []

                # Apply crossover
                if self.k_gen_ > 0:
                    weights = [ind.score for ind in self.population]
                    weights = get_ranks(weights, normalize=True)
                else:
                    weights = None

                for _ in range(self.pop_size):
                    ind1, ind2 = self.rstate.choice(self.population,
                                                    2,
                                                    p=weights)
                    child, _ = self.toolbox.mate(ind1, ind2)
                    offspring.append(child)

                # Apply mutation
                for ind in offspring:
                    self.toolbox.mutate(ind)

                # Evaluate
                for ind in offspring:
                    self.toolbox.eval(ind)

                # Select
                self.population = self.toolbox.select(offspring,
                                                      k=self.pop_size)
                self.k_gen_ += 1

            except KeyboardInterrupt:
                break

            if self.verbose:
                print()

                scores = [ind.score for ind in offspring]
                avg = np.mean(scores)
                std = np.std(scores)

                logmsg('SCORE AVG: {:.{n}f} ± {:.{n}f}'.format(
                    avg, std, n=self.n_digits))
                logmsg('SCORE MIN: {:.{n}f}'.format(np.min(scores),
                                                    n=self.n_digits))
                logmsg('SCORE MAX: {:.{n}f}'.format(np.max(scores),
                                                    n=self.n_digits))
                print()

                sizes = [ind.n_selected for ind in offspring]
                avg = int(np.mean(sizes))
                std = int(np.std(sizes))

                logmsg('SIZE AVG: {} ± {}'.format(avg, std))
                logmsg('SIZE MIN: {}'.format(np.min(sizes)))
                logmsg('SIZE MAX: {}'.format(np.max(sizes)))
                print()

                times = [ind.eval_time for ind in offspring]
                time_avg = secfmt(np.mean(times))
                time_sum = secfmt(np.sum(times))

                logmsg('TIME SUM: {}'.format(time_sum))
                logmsg('TIME AVG: {}'.format(time_avg))
                print()

        return self
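
The rank-weighted parent sampling used above can be reproduced with plain numpy: individuals are ranked by score, the ranks are normalized into sampling probabilities, and fitter individuals are drawn more often. A minimal sketch with hypothetical scores (get_ranks in the original is assumed to return such normalized ranks):

import numpy as np

rstate = np.random.RandomState(0)
scores = np.array([0.71, 0.64, 0.80, 0.55])   # hypothetical fitness scores

# Rank the scores (1 = worst) and normalize so the weights sum to 1
ranks = scores.argsort().argsort() + 1
weights = ranks / ranks.sum()

# For each child, sample two parent indices; higher-ranked parents are more likely
parents = [rstate.choice(len(scores), size=2, p=weights) for _ in range(4)]
print(parents)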