def estimators_samples_(self):
        sample_masks = []
        for _, sample_indices in self._get_estimators_indices():
            mask = indices_to_mask(sample_indices, self._n_samples)
            sample_masks.append(mask)

        return sample_masks
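All of these snippets revolve around sklearn.utils.indices_to_mask, which turns an array of drawn sample indices into a boolean in-bag mask (so ~mask selects the out-of-bag samples). A minimal sketch of its behavior, assuming the helper is importable from sklearn.utils as in these snippets:

import numpy as np
from sklearn.utils import indices_to_mask

indices = np.array([0, 2, 2])        # three draws (with replacement) from five samples
mask = indices_to_mask(indices, 5)   # in-bag mask
print(mask)                          # [ True False  True False False]
print(~mask)                         # out-of-bag: [False  True False  True  True]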
Example #2
    def _set_oob_score(self, X, y):
        n_samples = y.shape[0]

        predictions = np.zeros((n_samples, ))
        n_predictions = np.zeros((n_samples, ))

        for estimator, samples, features in zip(
                self.estimators_, self.sequentially_bootstrapped_samples_,
                self.estimators_features_):
            # Create mask for OOB samples
            mask = ~indices_to_mask(samples, n_samples)

            predictions[mask] += estimator.predict((X[mask, :])[:, features])
            n_predictions[mask] += 1

        if (n_predictions == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few estimators were used "
                 "to compute any reliable oob estimates.")
            n_predictions[n_predictions == 0] = 1

        predictions /= n_predictions

        self.oob_prediction_ = predictions
        self.oob_score_ = r2_score(y, predictions)
Example #3
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (i + 1, n_estimators, total_n_estimators))

        random_state = np.random.RandomState(seeds[i])
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        # Draw random feature, sample indices
        features, indices = _generate_bagging_indices(random_state,
                                                      bootstrap_features,
                                                      bootstrap, n_features,
                                                      n_samples, max_features,
                                                      max_samples)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                sample_counts = np.bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices_mask = ~indices_to_mask(indices, n_samples)
                curr_sample_weight[not_indices_mask] = 1.E-6

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)

        # Draw samples, using a mask, and then fit
        else:
            estimator.fit((X[indices])[:, features], y[indices])

        estimators.append(estimator)
        estimators_features.append(features)

    return estimators, estimators_features
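The two fitting branches above encode the same bootstrap in two ways: estimators that accept sample_weight keep every row and weight it by its draw count, while the fallback fits on the materialized resampled rows. A standalone sketch of the equivalence (toy sizes, not from the source):

import numpy as np

rng = np.random.RandomState(0)
n_samples = 6
indices = rng.randint(0, n_samples, n_samples)      # one bootstrap draw

# Weight encoding: all rows kept, weighted by draw counts;
# a count of 0 marks an out-of-bag sample.
sample_counts = np.bincount(indices, minlength=n_samples)

# Index encoding: the resampled rows themselves, e.g. X[indices], y[indices].
print(indices, sample_counts)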
Example #4
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):
    """Private function used to build a batch of estimators within a job.

    PU-bagging variant: each bag bootstraps only the unlabeled samples
    (y < 1) and always keeps every positive sample (y == 1) in-bag.
    """
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    estimators = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (i + 1, n_estimators, total_n_estimators))

        random_state = np.random.RandomState(seeds[i])
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        iP = [pair[0] for pair in enumerate(y) if pair[1] == 1]
        iU = [pair[0] for pair in enumerate(y) if pair[1] < 1]
        features, indices = _generate_bagging_indices(random_state,
                                                      bootstrap_features,
                                                      bootstrap, n_features,
                                                      len(iU), max_features,
                                                      max_samples)
        indices = [iU[i] for i in indices] + iP

        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples, ))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                sample_counts = np.bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices_mask = ~indices_to_mask(indices, n_samples)
                curr_sample_weight[not_indices_mask] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)

        else:
            estimator.fit((X[indices])[:, features], y[indices])

        estimators.append(estimator)
        estimators_features.append(features)

    return estimators, estimators_features
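A toy illustration of the sampling trick in this variant, under the reading that y == 1 marks positive samples and anything below 1 is unlabeled: each bag bootstraps only the unlabeled pool and then appends every positive index, so positives are always in-bag.

import numpy as np

rng = np.random.RandomState(0)
y = np.array([1, 0, 0, 1, 0, 0, 0])

iP = np.flatnonzero(y == 1)                      # positives, always kept
iU = np.flatnonzero(y < 1)                       # unlabeled pool, resampled
drawn = rng.randint(0, len(iU), size=len(iU))    # bootstrap within the pool
indices = np.concatenate([iU[drawn], iP])
print(indices)                                   # every positive appears exactly once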
Example #5
def _predict_score_single_estimator(estimator, X, features, samples, split,
                                    n_samples):
    # Create mask for OOB samples
    samples = indices_to_mask(samples, n_samples)
    unsampled = ~samples

    predictions = np.empty(n_samples, dtype=np.int8)
    predictions[unsampled] = estimator.predict(X[np.ix_(unsampled, features)])

    oob_score = accuracy_score(split[unsampled], predictions[unsampled])
    return predictions, oob_score
Example #6
def _masked_bagging_indices(random_state, bootstrap_features,
                            bootstrap_samples, n_features, n_samples,
                            max_features, max_samples):
    """Monkey-patch to always get a mask instead of indices"""
    feature_indices, sample_indices = old_generate(random_state,
                                                   bootstrap_features,
                                                   bootstrap_samples,
                                                   n_features, n_samples,
                                                   max_features, max_samples)
    sample_indices = indices_to_mask(sample_indices, n_samples)

    return feature_indices, sample_indices
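The snippet assumes an old_generate captured before patching. A minimal sketch of how such a monkey-patch is typically wired up; the module path is an assumption (recent scikit-learn uses sklearn.ensemble._bagging, older releases sklearn.ensemble.bagging):

import sklearn.ensemble._bagging as bagging_module

old_generate = bagging_module._generate_bagging_indices   # keep the original
bagging_module._generate_bagging_indices = _masked_bagging_indices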
Example #7
    def estimators_samples_(self):
        """The subset of drawn samples for each base estimator.
        Returns a dynamically generated list of boolean masks identifying
        the samples used for fitting each member of the ensemble, i.e.,
        the in-bag samples.
        Note: the list is re-created at each call to the property in order
        to reduce the object memory footprint by not storing the sampling
        data. Thus fetching the property may be slower than expected.
        """
        sample_masks = []
        for _, sample_indices in self._get_estimators_indices():
            mask = indices_to_mask(sample_indices, self._n_samples)
            sample_masks.append(mask)

        return sample_masks
Example #8
def score_precision_recall(X,
              y,
              rules: List[List[str]],
              samples: List[List[int]],
              features: List[List[int]],
              feature_names: List[str],
              oob: bool = True) -> List[Rule]:

    scored_rules = []

    for curr_rules, curr_samples, curr_features in zip(rules, samples, features):

        # Create mask for OOB samples
        mask = ~indices_to_mask(curr_samples, X.shape[0])
        if sum(mask) == 0:
            if oob:
                warn(
                    "OOB evaluation not possible: doing it in-bag. Performance evaluation is likely to be wrong"
                    " (overfitting) and selected rules are likely to not perform well! Please use max_samples < 1."
                )
            mask = curr_samples

        # XXX todo: idem without dataframe

        X_oob = pd.DataFrame(
            (X[mask, :])[:, curr_features],
            columns=np.array(feature_names)[curr_features]
        )

        if X_oob.shape[1] <= 1:  # otherwise pandas bug (cf. issue #16363)
            return []

        y_oob = y[mask]
        y_oob = np.array((y_oob != 0))

        # Add OOB performances to rules:
        scored_rules += [
            Rule(r, args=_eval_rule_perf(r, X_oob, y_oob))
            for r in set(curr_rules)
        ]

    return scored_rules
Example #9
    def _set_oob_score(self, X, y):
        n_samples = y.shape[0]
        n_classes_ = self.n_classes_

        predictions = np.zeros((n_samples, n_classes_))

        for estimator, samples, features in zip(
                self.estimators_, self.sequentially_bootstrapped_samples_,
                self.estimators_features_):
            # Create mask for OOB samples
            mask = ~indices_to_mask(samples, n_samples)

            if hasattr(estimator, "predict_proba"):
                predictions[mask, :] += estimator.predict_proba(
                    (X[mask, :])[:, features])

            else:
                p = estimator.predict((X[mask, :])[:, features])
                j = 0

                for i in range(n_samples):
                    if mask[i]:
                        predictions[i, p[j]] += 1
                        j += 1

        if (predictions.sum(axis=1) == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few estimators were used "
                 "to compute any reliable oob estimates.")

        oob_decision_function = np.divide(
            predictions,
            predictions.sum(axis=1)[:, np.newaxis],
            out=np.zeros_like(predictions),
            where=predictions.sum(axis=1)[:, np.newaxis] != 0)
        oob_score = accuracy_score(y, np.argmax(predictions, axis=1))

        self.oob_decision_function_ = oob_decision_function
        self.oob_score_ = oob_score
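The np.divide(..., out=..., where=...) call at the end is what keeps rows that never received an OOB vote at zero instead of producing NaN. A standalone illustration of the same pattern (toy vote counts):

import numpy as np

votes = np.array([[2., 1.], [0., 0.], [0., 3.]])
row_sums = votes.sum(axis=1)[:, np.newaxis]
proba = np.divide(votes, row_sums,
                  out=np.zeros_like(votes), where=row_sums != 0)
print(proba)   # middle row stays [0. 0.] rather than [nan nan]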
Example #10
def _parallel_build_estimators(n_estimators,
                               ensemble,
                               X,
                               y,
                               sample_weight,
                               seeds_features,
                               seeds_samples,
                               seeds_max_features,
                               total_n_estimators,
                               verbose,
                               start_index,
                               draw_max_features=False,
                               circular_features=False):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")
    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_features = []
    estimators_samples = []
    estimators_splits = []

    for i in range(n_estimators):
        if verbose > 2:
            print(
                "Building estimator %d of %d for this parallel run (total %d)..."
                % (i + 1, n_estimators, total_n_estimators))

        random_state_max_features = np.random.RandomState(
            seeds_max_features[i])
        random_state_features = np.random.RandomState(seeds_features[i])
        random_state = np.random.RandomState(seeds_samples[i])
        estimator = ensemble._make_estimator(append=False,
                                             random_state=random_state)

        # Draw random feature, sample indices
        if circular_features:
            n_features_window = ensemble.window_size
            max_features_window = max_features
        else:
            n_features_window = min(ensemble.window_size,
                                    n_features - start_index[i])
            max_features_window = min(max_features,
                                      n_features - start_index[i])

        features, indices = _generate_bagging_indices(
            random_state_features,
            random_state,
            random_state_max_features,
            bootstrap_features,
            bootstrap,
            n_features_window,
            n_samples,
            max_features_window,
            max_samples,
            draw_max_features=draw_max_features)

        features += start_index[i]

        # ensure not going outside range, take the first ones instead
        np.mod(features, n_features, out=features)

        # Draw samples, using sample weights, and then fit
        y_binary = random_binarizer(y)
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples, ))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                sample_counts = np.bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices_mask = ~indices_to_mask(indices, n_samples)
                curr_sample_weight[not_indices_mask] = 0

            estimator.fit(X[:, features],
                          y_binary,
                          sample_weight=curr_sample_weight)

        # Draw samples, using a mask, and then fit
        else:
            estimator.fit((X[indices])[:, features], y_binary[indices])

        estimators.append(estimator)
        estimators_features.append(features)
        estimators_samples.append(indices)
        estimators_splits.append(y_binary)

    return estimators, estimators_features, estimators_samples, estimators_splits
Example #11
File: ospa.py  Project: rth/ramp-workflow
def ospa_single(y_true, y_pred, minipatch=None):
    """
    OSPA score on single patch. See docstring of `ospa` for more info.

    Parameters
    ----------
    y_true, y_pred : ndarray of shape (3, x)
        arrays of (x, y, radius)
    minipatch : [row_min, row_max, col_min, col_max], optional
        Bounds of the internal scoring region (default is None)

    Returns
    -------
    (iou_sum, n_pred, n_total)
        float - sum of ious of matched entries
        int - number of matched entries
        int - total number of entries

    """
    n_true = len(y_true)
    n_pred = len(y_pred)

    # No craters and none found
    if n_true == 0 and n_pred == 0:
        return 0, 0, 0

    # Mask of entries that lie within the minipatch
    if minipatch is not None:
        true_in_minipatch = _select_minipatch_tuples(y_true, minipatch)
        pred_in_minipatch = _select_minipatch_tuples(y_pred, minipatch)
    else:
        true_in_minipatch = np.ones(len(y_true)).astype(bool)
        pred_in_minipatch = np.ones(len(y_pred)).astype(bool)

    n_minipatch = true_in_minipatch.sum() + pred_in_minipatch.sum()

    # No true craters but some found, or existing craters but none found
    if n_true == 0 or n_pred == 0:
        return 0, 0, n_minipatch

    # First matching
    id_true, id_pred, ious = _match_tuples(y_true, y_pred)

    # For each set of entries (true and pred) create an array with
    # the iou corresponding to each object
    iou_true = np.zeros(len(y_true))
    iou_true[id_true] = ious
    iou_pred = np.zeros(len(y_pred))
    iou_pred[id_pred] = ious

    # Mask of matched entries
    true_matched = indices_to_mask(id_true, n_true)
    pred_matched = indices_to_mask(id_pred, n_pred)

    # Counting
    true_count = true_matched & true_in_minipatch
    pred_count = pred_matched & pred_in_minipatch

    # IoU computation on the final list
    iou_global = iou_true[true_count].sum() + iou_pred[pred_count].sum()
    n_count = true_count.sum() + pred_count.sum()

    return iou_global, n_count, n_minipatch
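The final AND of the two masks means only matched objects that also lie inside the scoring region are counted. A toy of that combination (values illustrative, not from the source):

import numpy as np
from sklearn.utils import indices_to_mask

true_matched = indices_to_mask(np.array([0, 2]), 4)       # matcher paired entries 0 and 2
true_in_minipatch = np.array([True, True, False, False])  # only the first two are in-region
print(true_matched & true_in_minipatch)                   # [ True False False False]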
Example #12
    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X. Has to follow the convention 0 for
            normal data, 1 for anomalies.

        sample_weight : array-like, shape (n_samples,) optional
            Array of weights that are assigned to individual samples, typically
            the amount in case of transactions data. Used to grow regression
            trees producing further rules to be tested.
            If not provided, then each sample is given unit weight.

        Returns
        -------
        self : object
            Returns self.
        """

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_ = X.shape[1]

        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        if n_classes < 2:
            raise ValueError("This method needs samples of at least 2 classes"
                             " in the data, but the data contains only one"
                             " class: %r" % self.classes_[0])

        if not isinstance(self.max_depth_duplication, int) \
                and self.max_depth_duplication is not None:
            raise ValueError("max_depth_duplication should be an integer")
        if not set(self.classes_) == set([0, 1]):
            warn("Found labels %s. This method assumes target class to be"
                 " labeled as 1 and normal data to be labeled as 0. Any label"
                 " different from 0 will be considered as being from the"
                 " target class." % set(self.classes_))
            y = (y > 0)

        # ensure that max_samples is in [1, n_samples]:
        n_samples = X.shape[0]

        if isinstance(self.max_samples, six.string_types):
            raise ValueError('max_samples (%s) is not supported.'
                             'Valid choices are: "auto", int or'
                             'float' % self.max_samples)

        elif isinstance(self.max_samples, INTEGER_TYPES):
            if self.max_samples > n_samples:
                warn("max_samples (%s) is greater than the "
                     "total number of samples (%s). max_samples "
                     "will be set to n_samples for estimation." %
                     (self.max_samples, n_samples))
                max_samples = n_samples
            else:
                max_samples = self.max_samples
        else:  # float
            if not (0. < self.max_samples <= 1.):
                raise ValueError("max_samples must be in (0, 1], got %r" %
                                 self.max_samples)
            max_samples = int(self.max_samples * X.shape[0])

        self.max_samples_ = max_samples

        self.rules_ = {}
        self.estimators_ = []
        self.estimators_samples_ = []
        self.estimators_features_ = []

        # default columns names :
        feature_names_ = [
            BASE_FEATURE_NAME + x for x in np.arange(X.shape[1]).astype(str)
        ]
        if self.feature_names is not None:
            self.feature_dict_ = {
                BASE_FEATURE_NAME + str(i): feat
                for i, feat in enumerate(self.feature_names)
            }
        else:
            self.feature_dict_ = {
                BASE_FEATURE_NAME + str(i): feat
                for i, feat in enumerate(feature_names_)
            }
        self.feature_names_ = feature_names_

        clfs = []
        regs = []

        self._max_depths = self.max_depth \
            if isinstance(self.max_depth, Iterable) else [self.max_depth]

        for max_depth in self._max_depths:
            bagging_clf = BaggingClassifier(
                base_estimator=DecisionTreeClassifier(
                    max_depth=max_depth,
                    max_features=self.max_features,
                    min_samples_split=self.min_samples_split),
                n_estimators=self.n_estimators,
                max_samples=self.max_samples_,
                max_features=self.max_samples_features,
                bootstrap=self.bootstrap,
                bootstrap_features=self.bootstrap_features,
                # oob_score=... XXX may be added
                # if selection on tree perf needed.
                # warm_start=... XXX may be added to increase computation perf.
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose)

            bagging_reg = BaggingRegressor(
                base_estimator=DecisionTreeRegressor(
                    max_depth=max_depth,
                    max_features=self.max_features,
                    min_samples_split=self.min_samples_split),
                n_estimators=self.n_estimators,
                max_samples=self.max_samples_,
                max_features=self.max_samples_features,
                bootstrap=self.bootstrap,
                bootstrap_features=self.bootstrap_features,
                # oob_score=... XXX may be added
                # if selection on tree perf needed.
                # warm_start=... XXX may be added to increase computation perf.
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose)

            clfs.append(bagging_clf)
            regs.append(bagging_reg)

        # define regression target:
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            weights = sample_weight - sample_weight.min()
            contamination = float(sum(y)) / len(y)
            y_reg = (pow(weights, 0.5) * 0.5 / contamination * (y > 0) - pow(
                (weights).mean(), 0.5) * (y == 0))
            y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
        else:
            y_reg = y  # same as an other classification bagging

        for clf in clfs:
            clf.fit(X, y)
            self.estimators_ += clf.estimators_
            self.estimators_samples_ += clf.estimators_samples_
            self.estimators_features_ += clf.estimators_features_

        for reg in regs:
            reg.fit(X, y_reg)
            self.estimators_ += reg.estimators_
            self.estimators_samples_ += reg.estimators_samples_
            self.estimators_features_ += reg.estimators_features_

        rules_ = []
        for estimator, samples, features in zip(self.estimators_,
                                                self.estimators_samples_,
                                                self.estimators_features_):

            # Create mask for OOB samples
            mask = ~indices_to_mask(samples, n_samples)

            if sum(mask) == 0:
                warn("OOB evaluation not possible: doing it in-bag."
                     " Performance evaluation is likely to be wrong"
                     " (overfitting) and selected rules are likely to"
                     " not perform well! Please use max_samples < 1.")
                mask = samples
            rules_from_tree = self._tree_to_rules(
                estimator,
                np.array(self.feature_names_)[features])

            # XXX todo: idem without dataframe
            X_oob = pandas.DataFrame(
                (X[mask, :])[:, features],
                columns=np.array(self.feature_names_)[features])

            if X_oob.shape[1] > 1:  # otherwise pandas bug (cf. issue #16363)
                y_oob = y[mask]
                y_oob = np.array((y_oob != 0))

                # Add OOB performances to rules:
                rules_from_tree = [(r, self._eval_rule_perf(r, X_oob, y_oob))
                                   for r in set(rules_from_tree)]
                rules_ += rules_from_tree

        # Factorize rules before semantic tree filtering
        rules_ = [
            tuple(rule) for rule in [Rule(r, args=args) for r, args in rules_]
        ]

        # keep only rules verifying precision_min and recall_min:
        for rule, score in rules_:
            if score[0] >= self.precision_min and score[1] >= self.recall_min:
                if rule in self.rules_:
                    # update the score to the new mean
                    c = self.rules_[rule][2] + 1
                    b = self.rules_[rule][1] + 1. / c * (score[1] -
                                                         self.rules_[rule][1])
                    a = self.rules_[rule][0] + 1. / c * (score[0] -
                                                         self.rules_[rule][0])

                    self.rules_[rule] = (a, b, c)
                else:
                    self.rules_[rule] = (score[0], score[1], 1)

        self.rules_ = sorted(self.rules_.items(),
                             key=lambda x: (x[1][0], x[1][1]),
                             reverse=True)

        # Deduplicate the rule using semantic tree
        if self.max_depth_duplication is not None:
            self.rules_ = self.deduplicate(self.rules_)

        self.rules_ = sorted(self.rules_, key=lambda x: -self.f1_score(x))
        self.rules_without_feature_names_ = self.rules_

        # Replace generic feature names by real feature names
        self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                       for rule, perf in self.rules_]

        return self
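The score update in the rule-selection loop above is an incremental mean: after c observations, mean_new = mean_old + (x - mean_old) / c. A standalone check of that identity (hypothetical helper, for illustration only):

def running_mean(values):
    mean, c = 0.0, 0
    for x in values:
        c += 1
        mean += (x - mean) / c   # same update as the (a, b, c) triple above
    return mean

assert abs(running_mean([0.2, 0.4, 0.9]) - (0.2 + 0.4 + 0.9) / 3) < 1e-12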
Example #13
def bootstrap_prediction(X,
                         y,
                         score_func,
                         base_estimator=None,
                         n_estimators=10,
                         max_samples=1.0,
                         max_features=1.0,
                         bootstrap=True,
                         bootstrap_features=False,
                         n_jobs=None,
                         random_state=None):
    """Bootstrap the scores from an `sklearn` estimator.

    Args:
        X (array-like, dtype=float64, size=[n_samples, n_features]): Feature matrix.
        y (array, dtype=float64, size=[n_samples]): Target vector.
        score_func (callable): Score function (or loss function) with signature
            score_func(y, y_pred, **kwargs).
        base_estimator (object or None, optional): Defaults to None. The base estimator
            to fit on random subsets of the dataset. If None, then the base estimator
            is a decision tree.
        n_estimators (int, optional): Defaults to 10. The number of base estimators in
            the ensemble.
        max_samples (int or float, optional): Defaults to 1.0. The number of samples
            to draw from X to train each base estimator. If int, then draw max_samples
            samples. If float, then draw max_samples * X.shape[0] samples.
        max_features (int or float, optional): Defaults to 1.0. The number of features
            to draw from X to train each base estimator. If int, then draw max_features
            features. If float, then draw max_features * X.shape[1] features.
        bootstrap (bool, optional): Defaults to True. Whether samples are drawn with
            replacement.
        bootstrap_features (bool, optional): Defaults to False. Whether features are
            drawn with replacement.
        n_jobs (int or None, optional): Defaults to None. The number of jobs to run in
            parallel for both fit and predict. None means 1 unless in a
            joblib.parallel_backend context.
        random_state (int, RandomState instance or None, optional): Defaults to None.
            If int, random_state is the seed used by the random number generator; If
            RandomState instance, random_state is the random number generator; If None,
            the random number generator is the RandomState instance used by np.random.

    Returns:
        numpy.ndarray: Distribution of score function statistic.
    """

    bag = BaggingClassifier(base_estimator=base_estimator,
                            n_estimators=n_estimators,
                            max_samples=max_samples,
                            max_features=max_features,
                            bootstrap=bootstrap,
                            bootstrap_features=bootstrap_features,
                            n_jobs=n_jobs,
                            random_state=random_state)
    bag.fit(X, y)

    stats = []
    for estimator, samples in zip(bag.estimators_, bag.estimators_samples_):
        # Create mask for OOB samples
        mask = ~indices_to_mask(samples, len(y))

        # Compute predictions on out-of-bag samples
        y_pred = estimator.predict(X[mask])

        # Compute statistic
        stat = score_func(y[mask], y_pred)
        stats.append(stat)

    stats = np.array(stats)
    return stats
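A minimal usage sketch; the dataset and scorer are illustrative choices, not from the source:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

X, y = make_classification(n_samples=200, random_state=0)
scores = bootstrap_prediction(X, y, score_func=accuracy_score,
                              n_estimators=50, max_samples=0.8,
                              random_state=0)
print(scores.mean(), scores.std())   # distribution of OOB accuracy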
Example #14
    def _get_support_mask(self):
        check_is_fitted(self, 'scores_')
        mask = indices_to_mask(self.indices_, self.num_features_)
        return mask
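This is presumably the SelectorMixin hook: transform keeps exactly the columns where the returned mask is True. A toy of the conversion it performs (the fitted attribute values here are hypothetical):

import numpy as np
from sklearn.utils import indices_to_mask

indices_, num_features_ = np.array([1, 3]), 5
support = indices_to_mask(indices_, num_features_)
print(support)   # [False  True False  True False]; X[:, support] would reduce X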