예제 #1
0
    def fit_weights(self,
                    X,
                    y,
                    sample_weight=None,
                    parallel_profile=None,
                    features=None):
        """
        Train every estimator of this collection on the same data,
        optionally with a separate set of weights for each estimator.

        :param X: training data, passed unchanged to every estimator
        :param y: training targets, passed unchanged to every estimator
        :param sample_weight: either a single weights object (or None) that is
            shared by all estimators, or a dict / OrderedDict holding the
            weights of each estimator. If the dict has an entry for every
            estimator name, weights are matched to estimators by name;
            otherwise its values are used positionally in the dict's own order.
        :param parallel_profile: forwarded as the first argument
            of `utils.map_on_cluster`
        :param features: if not None, set as the `features` parameter
            of every estimator before training
        :return: self
        """
        if features is not None:
            for name, estimator in self.items():
                if estimator.features is not None:
                    print('Overwriting features of estimator ' + name)
                self[name].set_params(features=features)

        names = list(self.keys())

        # allow specifying different weights for each classifier
        if isinstance(sample_weight, dict):
            if all(name in sample_weight for name in names):
                # Pair weights with estimators by name, so the order in which
                # the dict was filled cannot mismatch weights and estimators.
                sample_weight = [sample_weight[name] for name in names]
            else:
                # Keys do not cover the estimator names: keep the historical
                # positional pairing.
                sample_weight = list(sample_weight.values())
        else:
            sample_weight = [sample_weight] * len(self)

        start_time = time.time()
        result = utils.map_on_cluster(parallel_profile, train_estimator,
                                      names, list(self.values()),
                                      [X] * len(self), [y] * len(self),
                                      sample_weight)
        # Each result item is a (status, data) pair; on 'success' data holds
        # the estimator name, the trained estimator and the time spent.
        for status, data in result:
            if status == 'success':
                name, estimator, spent_time = data
                self[name] = estimator
                print('model {:12} was trained in {:.2f} seconds'.format(
                    name, spent_time))
            else:
                print('Problem while training on the node, report:\n', data)

        print("Totally spent {:.2f} seconds on training".format(time.time() -
                                                                start_time))
        return self
예제 #2
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            parallel_profile=None,
            features=None):
        """
        Fit every estimator of the collection on the same samples.

        Each estimator gets its own target: `(y == label) * 1`, where `label`
        is the value `names_labels_correspondence` holds for the estimator's name.

        :param X: pandas.DataFrame of shape [n_samples, n_features] with features
        :param y: array-like of shape [n_samples] with labels of samples
        :param sample_weight: weights of events, array-like of shape [n_samples]
            or None if all weights are equal; the same object is given
            to every estimator
        :param parallel_profile: profile of parallel execution system or None
        :type parallel_profile: None or str
        :param features: features to train estimators;
            if None, estimators keep their own `estimator.features`
        :type features: None or list[str]
        :return: self
        """
        if features is not None:
            for model_name, model in self.items():
                if model.features is not None:
                    print('Overwriting features of estimator ' + model_name)
                self[model_name].set_params(features=features)

        started_at = time.time()

        # One target per estimator, in the order of the collection's keys.
        targets = [(y == names_labels_correspondence[model_name]) * 1
                   for model_name in self.keys()]

        n_models = len(self)
        outcomes = map_on_cluster(parallel_profile, train_estimator,
                                  list(self.keys()), list(self.values()),
                                  [X] * n_models, targets,
                                  [sample_weight] * n_models)

        for status, payload in outcomes:
            if status != 'success':
                print('Problem while training on the node, report:\n', payload)
                continue
            model_name, fitted_model, elapsed = payload
            self[model_name] = fitted_model
            print('model {:12} was trained in {:.2f} seconds'.format(
                model_name, elapsed))

        total_elapsed = time.time() - started_at
        print("Totally spent {:.2f} seconds on training".format(total_elapsed))
        return self
예제 #3
0
    def fit(self, X, y, sample_weight=None):
        """
        Train the classifier, will train several base classifiers on overlapping
        subsets of training dataset.

        One clone of `base_estimator` is created per fold; the clone with
        index `i` is trained on the samples whose fold number is not `i`.
        Estimators left over from a previous call are discarded, so calling
        `fit` again always leaves exactly `n_folds` estimators.

        :param X: pandas.DataFrame of shape [n_samples, n_features]
        :param y: labels of events - array-like of shape [n_samples]
        :param sample_weight: weight of events,
               array-like of shape [n_samples] or None if all weights are equal
        :return: self
        """
        if hasattr(self.base_estimator, 'features'):
            assert self.base_estimator.features is None, \
                'Base estimator must have None features! Use features parameter in Folding instead'
        # Length is recorded before _prepare_data rebinds X.
        self.train_length = len(X)
        group_column, (X, y, sample_weight) = self._prepare_data(X, y, sample_weight)
        folds_column = self._get_folds_column(len(X), group_column)

        # Rebuild the list instead of appending to it: appending made repeated
        # fit() calls pile up extra clones beyond n_folds.
        self.estimators = [clone(self.base_estimator) for _ in range(self.n_folds)]

        if sample_weight is None:
            weights_iterator = [None] * self.n_folds
        else:
            weights_iterator = (sample_weight[folds_column != index] for index in range(self.n_folds))

        # The position in self.estimators is used as the task name, so that a
        # trained estimator can be stored back into its slot.
        result = map_on_cluster(self.parallel_profile, train_estimator,
                                range(len(self.estimators)),
                                self.estimators,
                                (X.iloc[folds_column != index, :].copy() for index in range(self.n_folds)),
                                (y[folds_column != index] for index in range(self.n_folds)),
                                weights_iterator)
        for status, data in result:
            if status == 'success':
                name, classifier, spent_time = data
                self.estimators[name] = classifier
            else:
                print('Problem while training on the node, report:\n', data)
        return self
예제 #4
0
    def fit(self, X, y, sample_weight=None, parallel_profile=None, features=None):
        """
        Fit all estimators of the collection on one shared dataset.

        The target given to an estimator is `(y == label) * 1`, with `label`
        looked up in `names_labels_correspondence` by the estimator's name.

        :param X: pandas.DataFrame of shape [n_samples, n_features] with features
        :param y: array-like of shape [n_samples] with labels of samples
        :param sample_weight: weights of events, array-like of shape [n_samples]
            or None if all weights are equal; shared by all estimators
        :param parallel_profile: profile of parallel execution system or None
        :type parallel_profile: None or str
        :param features: features to train estimators;
            if None, estimators will be trained on `estimator.features`
        :type features: None or list[str]
        :return: self
        """
        if features is not None:
            for est_name, est in self.items():
                if est.features is not None:
                    print('Overwriting features of estimator ' + est_name)
                self[est_name].set_params(features=features)

        t0 = time.time()

        # Targets follow the iteration order of the collection's keys.
        per_model_labels = [(y == names_labels_correspondence[est_name]) * 1 for est_name in self.keys()]

        count = len(self)
        jobs = map_on_cluster(
            parallel_profile,
            train_estimator,
            list(self.keys()),
            list(self.values()),
            [X] * count,
            per_model_labels,
            [sample_weight] * count,
        )

        for status, report in jobs:
            if status != 'success':
                print('Problem while training on the node, report:\n', report)
                continue
            est_name, trained, seconds = report
            self[est_name] = trained
            print('model {:12} was trained in {:.2f} seconds'.format(est_name, seconds))

        spent = time.time() - t0
        print("Totally spent {:.2f} seconds on training".format(spent))
        return self
예제 #5
0
    def fit(self, X, y, parallel_profile=None, **params):
        """
        Train either the single base estimator or one model per label value.

        With `self.multi_mode` set, the call is delegated to
        `self.base_estimator.fit(X, y, **params)`; `parallel_profile` is not
        used in that case.

        Otherwise one model per key is trained on the target `(y == key) * 1`.
        If `self.models` is empty, the keys are the unique values of `y` and a
        clone of `self.base_estimator` is stored for each of them; if it
        already has entries, its existing keys are used.

        :param X: training data, passed unchanged to the estimator(s)
        :param y: labels of samples
        :param parallel_profile: forwarded as the first argument of `map_on_cluster`
        :param params: in multi mode, forwarded to the base estimator as is;
            otherwise only `sample_weight` is read (defaults to an array of
            ones of length `len(X)`)
        :return: self
        """
        if self.multi_mode:
            self.base_estimator.fit(X, y, **params)
            return self

        started_at = time.time()

        if len(self.models) > 0:
            binary_targets = [(y == class_key) * 1
                              for class_key in self.models.keys()]
        else:
            binary_targets = []
            for class_key in numpy.unique(y):
                binary_targets.append((y == class_key) * 1)
                self.models[class_key] = clone(self.base_estimator)

        if 'sample_weight' in params:
            weights = params['sample_weight']
        else:
            weights = numpy.ones(len(X))

        n_models = len(self.models)
        outcomes = map_on_cluster(parallel_profile, train_estimator,
                                  list(self.models.keys()),
                                  list(self.models.values()),
                                  [X] * n_models, binary_targets,
                                  [weights] * n_models)

        for status, payload in outcomes:
            if status != 'success':
                print('Problem while training on the node, report:\n',
                      payload)
                continue
            model_key, fitted_model, elapsed = payload
            self.models[model_key] = fitted_model
            print('model {:12} was trained in {:.2f} seconds'.format(
                model_key, elapsed))

        total_elapsed = time.time() - started_at
        print("Totally spent {:.2f} seconds on training".format(total_elapsed))
        return self