def fit_weights(self, X, y, sample_weight=None, parallel_profile=None, features=None):
    """
    Train all estimators on the same data, optionally with per-classifier weights.

    :param X: pandas.DataFrame of shape [n_samples, n_features] with features
    :param y: array-like of shape [n_samples] with labels of samples
    :param sample_weight: weights of events; an array-like of shape [n_samples]
        applied to every classifier, an OrderedDict mapping classifier names to
        such arrays, or None if all weights are equal
    :param parallel_profile: profile of parallel execution system or None
    :type parallel_profile: None or str
    :param features: features to train estimators on;
        if None, estimators will be trained on `estimator.features`
    :type features: None or list[str]
    :return: self
    """
    if features is not None:
        for name, estimator in self.items():
            if estimator.features is not None:
                print('Overwriting features of estimator ' + name)
            self[name].set_params(features=features)

    # allow specifying different weights for each classifier
    if isinstance(sample_weight, OrderedDict):
        sample_weight = list(sample_weight.values())
    else:
        sample_weight = [sample_weight] * len(self)

    start_time = time.time()
    result = utils.map_on_cluster(parallel_profile, train_estimator,
                                  list(self.keys()), list(self.values()),
                                  [X] * len(self), [y] * len(self), sample_weight)
    for status, data in result:
        if status == 'success':
            name, estimator, spent_time = data
            self[name] = estimator
            print('model {:12} was trained in {:.2f} seconds'.format(name, spent_time))
        else:
            print('Problem while training on the node, report:\n', data)
    print("Totally spent {:.2f} seconds on training".format(time.time() - start_time))
    return self
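# Usage sketch (an assumption, not part of this file): `self` is a dict-like
# collection of named classifiers, as in REP's ClassifiersFactory; the names
# 'model_a'/'model_b' and the weight arrays below are hypothetical. An OrderedDict
# gives each classifier its own event weights, while a plain array-like is
# broadcast to all of them:
#
#     from collections import OrderedDict
#     weights = OrderedDict([('model_a', weights_a), ('model_b', weights_b)])
#     factory.fit_weights(X_train, y_train, sample_weight=weights)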
def fit(self, X, y, sample_weight=None, parallel_profile=None, features=None):
    """
    Train all estimators on the same data.

    :param X: pandas.DataFrame of shape [n_samples, n_features] with features
    :param y: array-like of shape [n_samples] with labels of samples
    :param sample_weight: weights of events,
        array-like of shape [n_samples] or None if all weights are equal
    :param features: features to train estimators on;
        if None, estimators will be trained on `estimator.features`
    :type features: None or list[str]
    :param parallel_profile: profile of parallel execution system or None
    :type parallel_profile: None or str
    :return: self
    """
    if features is not None:
        for name, estimator in self.items():
            if estimator.features is not None:
                print('Overwriting features of estimator ' + name)
            self[name].set_params(features=features)

    start_time = time.time()
    # one-vs-rest: each estimator gets a binary target for its own class
    labels = []
    for key in self.keys():
        labels.append((y == names_labels_correspondence[key]) * 1)
    result = map_on_cluster(parallel_profile, train_estimator,
                            list(self.keys()), list(self.values()),
                            [X] * len(self), labels, [sample_weight] * len(self))
    for status, data in result:
        if status == 'success':
            name, estimator, spent_time = data
            self[name] = estimator
            print('model {:12} was trained in {:.2f} seconds'.format(name, spent_time))
        else:
            print('Problem while training on the node, report:\n', data)
    print("Totally spent {:.2f} seconds on training".format(time.time() - start_time))
    return self
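# Usage sketch (hypothetical names): `names_labels_correspondence` is assumed to
# be a mapping from estimator name to the class label that estimator should
# separate, so each model is trained one-vs-rest on a binary target:
#
#     names_labels_correspondence = {'signal_vs_rest': 1, 'background_vs_rest': 0}
#     factory.fit(X_train, y_train, sample_weight=None, parallel_profile=None)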
def fit(self, X, y, sample_weight=None):
    """
    Train the classifier: trains several base classifiers on overlapping
    subsets of the training dataset.

    :param X: pandas.DataFrame of shape [n_samples, n_features]
    :param y: labels of events - array-like of shape [n_samples]
    :param sample_weight: weight of events,
        array-like of shape [n_samples] or None if all weights are equal
    """
    if hasattr(self.base_estimator, 'features'):
        assert self.base_estimator.features is None, \
            'Base estimator must have None features! Use features parameter in Folding instead'

    self.train_length = len(X)
    group_column, (X, y, sample_weight) = self._prepare_data(X, y, sample_weight)
    folds_column = self._get_folds_column(len(X), group_column)

    for _ in range(self.n_folds):
        self.estimators.append(clone(self.base_estimator))

    if sample_weight is None:
        weights_iterator = [None] * self.n_folds
    else:
        weights_iterator = (sample_weight[folds_column != index]
                            for index in range(self.n_folds))

    # each fold's estimator is trained on all events outside that fold
    result = map_on_cluster(self.parallel_profile, train_estimator,
                            range(len(self.estimators)), self.estimators,
                            (X.iloc[folds_column != index, :].copy() for index in range(self.n_folds)),
                            (y[folds_column != index] for index in range(self.n_folds)),
                            weights_iterator)
    for status, data in result:
        if status == 'success':
            name, classifier, spent_time = data
            self.estimators[name] = classifier
        else:
            print('Problem while training on the node, report:\n', data)
    return self
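# Usage sketch for the folding scheme (the estimator construction below is an
# assumption based on this snippet's parameters, not a fixed API): with n_folds=2,
# the estimator for fold 0 is trained only on events outside fold 0, so
# predictions for each event can come from a model that never saw it:
#
#     folder = FoldingClassifier(base_estimator=GradientBoostingClassifier(),
#                                n_folds=2, parallel_profile=None)
#     folder.fit(X_train, y_train)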
def fit(self, X, y, parallel_profile=None, **params):
    """
    Train the estimator. In multi_mode the base estimator handles multiclass
    data itself in a single fit call; otherwise one binary model is trained
    per class (one-vs-rest).

    :param X: pandas.DataFrame of shape [n_samples, n_features] with features
    :param y: array-like of shape [n_samples] with labels of samples
    :param parallel_profile: profile of parallel execution system or None
    :type parallel_profile: None or str
    :return: self
    """
    if self.multi_mode:
        self.base_estimator.fit(X, y, **params)
    else:
        start_time = time.time()
        labels = []
        if len(self.models) == 0:
            # first fit: create one model per unique label in y
            keys = numpy.unique(y)
            for key in keys:
                labels.append((y == key) * 1)
                self.models[key] = clone(self.base_estimator)
        else:
            for key in self.models.keys():
                labels.append((y == key) * 1)

        sample_weight = numpy.ones(len(X)) if 'sample_weight' not in params else params['sample_weight']
        result = map_on_cluster(parallel_profile, train_estimator,
                                list(self.models.keys()), list(self.models.values()),
                                [X] * len(self.models), labels,
                                [sample_weight] * len(self.models))
        for status, data in result:
            if status == 'success':
                name, estimator, spent_time = data
                self.models[name] = estimator
                print('model {:12} was trained in {:.2f} seconds'.format(name, spent_time))
            else:
                print('Problem while training on the node, report:\n', data)
        print("Totally spent {:.2f} seconds on training".format(time.time() - start_time))
    return self
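# Usage sketch (the wrapper class name `MulticlassWrapper` is hypothetical; only
# the `multi_mode`, `base_estimator` and `models` attributes are taken from the
# method above): multi_mode=True delegates multiclass handling to the base
# estimator, while multi_mode=False spawns one binary model per unique label:
#
#     ovr = MulticlassWrapper(base_estimator=some_binary_classifier, multi_mode=False)
#     ovr.fit(X_train, y_train)   # trains len(numpy.unique(y_train)) models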