def fit(self, X, Y):
    """Fit a LinearSVC on ``(X, Y)`` and expose it as a feature selector.

    The hyperparameters stored on ``self`` are normalised in place first:
    ``C``, ``tol`` and ``intercept_scaling`` are cast to ``float``, ``dual``
    and ``fit_intercept`` go through ``check_for_bool``, and ``class_weight``
    is reset to ``None`` when ``check_none`` reports it as none-like.

    The fitted model is wrapped in ``SelectFromModel`` (``threshold='mean'``,
    ``prefit=True``) and stored as ``self.preprocessor``.

    Returns:
        self
    """
    from sklearn.feature_selection import SelectFromModel
    import sklearn.svm

    for float_attr in ("C", "tol", "intercept_scaling"):
        setattr(self, float_attr, float(getattr(self, float_attr)))
    for flag_attr in ("dual", "fit_intercept"):
        setattr(self, flag_attr, check_for_bool(getattr(self, flag_attr)))
    if check_none(self.class_weight):
        self.class_weight = None

    svc_kwargs = {
        "penalty": self.penalty,
        "loss": self.loss,
        "dual": self.dual,
        "tol": self.tol,
        "C": self.C,
        "class_weight": self.class_weight,
        "fit_intercept": self.fit_intercept,
        "intercept_scaling": self.intercept_scaling,
        "multi_class": self.multi_class,
        "random_state": self.random_state,
    }
    linear_svc = sklearn.svm.LinearSVC(**svc_kwargs)
    linear_svc.fit(X, Y)

    # prefit=True: the selector reuses the already fitted model as-is.
    self.preprocessor = SelectFromModel(
        estimator=linear_svc, threshold='mean', prefit=True)
    return self
def fit(self, X, Y):
    """Train a LinearSVC classifier and store it as ``self.estimator``.

    Hyperparameters on ``self`` are coerced in place (``float`` casts,
    ``check_for_bool`` for the flags, ``class_weight`` set to ``None`` when
    ``check_none`` says so). When ``Y`` is two-dimensional with more than one
    column, the SVC is wrapped in a ``OneVsRestClassifier`` (``n_jobs=1``);
    otherwise it is used directly.

    Returns:
        self
    """
    import sklearn.multiclass
    import sklearn.svm

    self.C = float(self.C)
    self.tol = float(self.tol)
    self.intercept_scaling = float(self.intercept_scaling)
    self.dual = check_for_bool(self.dual)
    self.fit_intercept = check_for_bool(self.fit_intercept)
    if check_none(self.class_weight):
        self.class_weight = None

    base_svc = sklearn.svm.LinearSVC(
        penalty=self.penalty,
        loss=self.loss,
        dual=self.dual,
        tol=self.tol,
        C=self.C,
        class_weight=self.class_weight,
        fit_intercept=self.fit_intercept,
        intercept_scaling=self.intercept_scaling,
        multi_class=self.multi_class,
        random_state=self.random_state,
    )

    # A 2-D target with several columns is handled one column at a time.
    has_multiple_target_columns = len(Y.shape) == 2 and Y.shape[1] > 1
    self.estimator = (
        sklearn.multiclass.OneVsRestClassifier(base_svc, n_jobs=1)
        if has_multiple_target_columns
        else base_svc
    )
    self.estimator.fit(X, Y)
    return self
def fit(self, X, Y):
    """Fit a ``PolynomialFeatures`` transformer and keep it on ``self``.

    ``degree`` is cast to ``int`` and the two flags (``interaction_only``,
    ``include_bias``) are passed through ``check_for_bool`` before the
    transformer is built. The transformer is stored as ``self.preprocessor``
    and then fitted on ``(X, Y)``.

    Returns:
        self
    """
    import sklearn.preprocessing

    self.degree = int(self.degree)
    self.interaction_only = check_for_bool(self.interaction_only)
    self.include_bias = check_for_bool(self.include_bias)

    expander = sklearn.preprocessing.PolynomialFeatures(
        degree=self.degree,
        interaction_only=self.interaction_only,
        include_bias=self.include_bias,
    )
    self.preprocessor = expander
    expander.fit(X, Y)
    return self
def _fit(self, X, Y=None):
    """Fit FastICA on ``X`` and return the transformed data.

    ``whiten`` is passed through ``check_for_bool`` and ``n_components`` is
    set to ``None`` (when ``check_none`` says so) or cast to ``int``. The
    FastICA instance is stored as ``self.preprocessor``.

    Args:
        X: Data handed to ``FastICA.fit_transform``.
        Y: Unused; accepted for interface compatibility.

    Returns:
        The result of ``self.preprocessor.fit_transform(X)``.

    Raises:
        ValueError: With a pointer to the scikit-learn bug report when the
            underlying error mentions infs/NaNs; any other ``ValueError``
            from ``fit_transform`` is re-raised unchanged instead of being
            silently swallowed.
    """
    import sklearn.decomposition

    self.whiten = check_for_bool(self.whiten)
    if check_none(self.n_components):
        self.n_components = None
    else:
        self.n_components = int(self.n_components)

    self.preprocessor = sklearn.decomposition.FastICA(
        n_components=self.n_components,
        algorithm=self.algorithm,
        fun=self.fun,
        whiten=self.whiten,
        random_state=self.random_state,
    )

    nan_inf_message = 'array must not contain infs or NaNs'
    # Make the RuntimeWarning an Exception!
    with warnings.catch_warnings():
        warnings.filterwarnings("error", message=nan_inf_message)
        try:
            return self.preprocessor.fit_transform(X)
        except ValueError as e:
            # str(e) is safe even when e.args is empty or not a string.
            if nan_inf_message in str(e):
                raise ValueError(
                    "Bug in scikit-learn: https://github.com/scikit-learn/scikit-learn/pull/2738"
                )
            # Unrelated failures must surface; previously they were dropped
            # and the method returned ``self`` instead of transformed data.
            raise
def _fit(self, X, Y=None):
    """Build and fit a ``RandomTreesEmbedding`` stored as ``self.preprocessor``.

    Hyperparameters on ``self`` are coerced in place before the embedding is
    created. ``min_weight_fraction_leaf`` and ``bootstrap`` are normalised as
    well, although they are not forwarded to ``RandomTreesEmbedding`` here.

    Args:
        X: Training data passed to ``fit``.
        Y: Optional targets, forwarded to ``fit`` unchanged.

    Returns:
        self
    """
    import sklearn.ensemble

    def _int_unless_none(value):
        # None-like values (as judged by check_none) map to None.
        return None if check_none(value) else int(value)

    self.n_estimators = int(self.n_estimators)
    self.max_depth = _int_unless_none(self.max_depth)
    self.min_samples_split = int(self.min_samples_split)
    self.min_samples_leaf = int(self.min_samples_leaf)
    self.max_leaf_nodes = _int_unless_none(self.max_leaf_nodes)
    self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)
    self.bootstrap = check_for_bool(self.bootstrap)

    embedding = sklearn.ensemble.RandomTreesEmbedding(
        n_estimators=self.n_estimators,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        max_leaf_nodes=self.max_leaf_nodes,
        sparse_output=self.sparse_output,
        n_jobs=self.n_jobs,
        random_state=self.random_state,
    )
    self.preprocessor = embedding
    embedding.fit(X, Y)
    return self
def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
    """Grow a warm-started random forest by ``n_iter`` trees and refit it.

    On the first call (or when ``refit`` is true) the hyperparameters on
    ``self`` are coerced in place and a ``RandomForestClassifier`` with
    ``n_estimators=n_iter`` and ``warm_start=True`` is created. On later
    calls the existing forest's ``n_estimators`` is increased by ``n_iter``,
    capped at ``self.n_estimators``. In both cases the forest is then fitted.

    Args:
        X: Training data; ``X.shape[1]`` is used to derive ``max_features``.
        y: Targets.
        sample_weight: Forwarded to ``RandomForestClassifier.fit``.
        n_iter: Number of trees to add in this call.
        refit: Discard any existing estimator and start over.

    Returns:
        self
    """
    from sklearn.ensemble import RandomForestClassifier

    if refit:
        self.estimator = None

    if self.estimator is not None:
        # Warm start: only raise the tree budget, never above the target.
        forest = self.estimator
        forest.n_estimators = min(forest.n_estimators + n_iter,
                                  self.n_estimators)
    else:
        self.n_estimators = int(self.n_estimators)
        self.max_depth = (
            None if check_none(self.max_depth) else int(self.max_depth))
        self.min_samples_split = int(self.min_samples_split)
        self.min_samples_leaf = int(self.min_samples_leaf)
        self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

        # Named strategies pass through; anything else is an exponent
        # applied to the number of features.
        if self.max_features in ("sqrt", "log2", "auto"):
            feature_budget = self.max_features
        else:
            feature_budget = int(X.shape[1] ** float(self.max_features))

        self.bootstrap = check_for_bool(self.bootstrap)
        self.max_leaf_nodes = (
            None if check_none(self.max_leaf_nodes)
            else int(self.max_leaf_nodes))
        self.min_impurity_decrease = float(self.min_impurity_decrease)

        # initial fit of only increment trees
        self.estimator = RandomForestClassifier(
            n_estimators=n_iter,
            criterion=self.criterion,
            max_features=feature_budget,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            bootstrap=self.bootstrap,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            random_state=self.random_state,
            n_jobs=self.n_jobs,
            class_weight=self.class_weight,
            warm_start=True,
        )

    self.estimator.fit(X, y, sample_weight=sample_weight)
    return self
def fit(self, X, Y=None):
    """Fit a PCA on ``X`` and store it as ``self.preprocessor``.

    ``self.keep_variance`` is cast to ``float`` and passed as
    ``n_components``; ``self.whiten`` goes through ``check_for_bool``.

    Args:
        X: Data to fit the PCA on.
        Y: Unused; accepted for interface compatibility.

    Returns:
        self

    Raises:
        ValueError: If any fitted component is not finite.
    """
    import sklearn.decomposition

    variance_to_keep = float(self.keep_variance)
    self.whiten = check_for_bool(self.whiten)

    pca = sklearn.decomposition.PCA(
        n_components=variance_to_keep, whiten=self.whiten, copy=True)
    self.preprocessor = pca
    pca.fit(X)

    if np.isfinite(pca.components_).all():
        return self
    raise ValueError("PCA found non-finite components.")
def iterative_fit(self, X, y, n_iter=1, refit=False):
    """Incrementally train a multinomial naive Bayes model in 1000-row batches.

    On the first call (or when ``refit`` is true) the hyperparameters are
    coerced, the batch counter ``self.n_iter`` is reset to 0 and a fresh
    ``MultinomialNB`` is created. Negative entries of ``X`` are then set to
    zero **in place**. For targets with more than one column the estimator
    is wrapped in a ``OneVsRestClassifier`` and fitted in one go; otherwise
    up to ``n_iter`` consecutive batches are fed to ``partial_fit``.

    ``self.fully_fit_`` becomes true once all rows have been consumed (or
    after the one-vs-rest fit).

    Args:
        X: Dense or scipy-sparse training data; modified in place.
        y: Targets.
        n_iter: Maximum number of batches to process in this call.
        refit: Discard any existing estimator and start over.

    Returns:
        self
    """
    import scipy.sparse
    import sklearn.naive_bayes

    if refit:
        self.estimator = None

    if self.estimator is None:
        self.fit_prior = check_for_bool(self.fit_prior)
        self.alpha = float(self.alpha)
        self.n_iter = 0
        self.fully_fit_ = False
        self.estimator = sklearn.naive_bayes.MultinomialNB(
            alpha=self.alpha, fit_prior=self.fit_prior)
        self.classes_ = np.unique(y.astype(int))

    # Because the pipeline guarantees that each feature is positive,
    # clip all values below zero to zero
    values = X.data if scipy.sparse.issparse(X) else X
    values[values < 0] = 0.0

    # Fallback for multilabel classification
    if len(y.shape) > 1 and y.shape[1] > 1:
        import sklearn.multiclass

        self.estimator.n_iter = self.n_iter
        self.estimator = sklearn.multiclass.OneVsRestClassifier(
            self.estimator, n_jobs=1)
        self.estimator.fit(X, y)
        self.fully_fit_ = True
        return self

    batch_size = 1000
    for _ in range(n_iter):
        start = min(self.n_iter * batch_size, y.shape[0])
        stop = min((self.n_iter + 1) * batch_size, y.shape[0])
        batch = X[start:stop]
        if batch.shape[0] == 0:
            # Nothing left to consume.
            self.fully_fit_ = True
            break

        self.estimator.partial_fit(batch, y[start:stop], self.classes_)
        self.n_iter += 1

        if stop >= len(y):
            self.fully_fit_ = True
            break

    return self
def fit(self, X, Y, sample_weight=None):
    """Fit an ExtraTrees classifier and expose it as a feature selector.

    Hyperparameters on ``self`` are coerced in place, the classifier is
    fitted on ``(X, Y)`` and then wrapped in ``SelectFromModel``
    (``threshold='mean'``, ``prefit=True``), which is stored as
    ``self.preprocessor``.

    Args:
        X: Training data; ``X.shape[1]`` is used to derive ``max_features``.
        Y: Targets.
        sample_weight: Forwarded to ``ExtraTreesClassifier.fit``.

    Returns:
        self
    """
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.feature_selection import SelectFromModel

    self.n_estimators = int(self.n_estimators)
    if check_none(self.max_leaf_nodes):
        self.max_leaf_nodes = None
    else:
        self.max_leaf_nodes = int(self.max_leaf_nodes)
    if check_none(self.max_depth):
        self.max_depth = None
    else:
        self.max_depth = int(self.max_depth)

    self.bootstrap = check_for_bool(self.bootstrap)
    self.n_jobs = int(self.n_jobs)
    self.min_impurity_decrease = float(self.min_impurity_decrease)
    # Was a no-op self-assignment; coerce like every other hyperparameter.
    self.max_features = float(self.max_features)
    self.min_samples_leaf = int(self.min_samples_leaf)
    self.min_samples_split = int(self.min_samples_split)
    self.verbose = int(self.verbose)

    # self.max_features is an exponent applied to the number of features.
    max_features = int(X.shape[1] ** self.max_features)

    estimator = ExtraTreesClassifier(
        n_estimators=self.n_estimators,
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        bootstrap=self.bootstrap,
        max_features=max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        min_impurity_decrease=self.min_impurity_decrease,
        oob_score=self.oob_score,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
        random_state=self.random_state,
        class_weight=self.class_weight,
    )
    estimator.fit(X, Y, sample_weight=sample_weight)

    # prefit=True: the selector reuses the already fitted model as-is.
    self.preprocessor = SelectFromModel(
        estimator=estimator, threshold='mean', prefit=True)
    return self
def __init__(self, n_estimators, criterion, min_samples_leaf,
             min_samples_split, max_features, bootstrap, max_leaf_nodes,
             max_depth, min_weight_fraction_leaf, min_impurity_decrease,
             oob_score=False, n_jobs=1, random_state=None, verbose=0,
             class_weight=None):
    """Store the tree-ensemble hyperparameters, coerced to concrete types.

    Integer-like and float-like arguments are cast with ``int`` / ``float``,
    ``bootstrap`` goes through ``check_for_bool``, and ``max_depth`` /
    ``max_leaf_nodes`` become ``None`` when ``check_none`` says so.
    ``self.estimator`` starts out as ``None`` and ``estimator_increment``
    is fixed at 10.

    Raises:
        ValueError: If ``criterion`` is neither ``"gini"`` nor ``"entropy"``.
    """
    self.n_estimators = int(n_estimators)
    self.estimator_increment = 10

    allowed_criteria = ("gini", "entropy")
    if criterion not in allowed_criteria:
        raise ValueError(
            "'criterion' is not in ('gini', 'entropy'): %s" % criterion)
    self.criterion = criterion

    self.max_depth = None if check_none(max_depth) else int(max_depth)
    self.max_leaf_nodes = (
        None if check_none(max_leaf_nodes) else int(max_leaf_nodes))

    self.min_samples_leaf = int(min_samples_leaf)
    self.min_samples_split = int(min_samples_split)
    self.max_features = float(max_features)
    self.bootstrap = check_for_bool(bootstrap)
    self.min_weight_fraction_leaf = float(min_weight_fraction_leaf)
    self.min_impurity_decrease = float(min_impurity_decrease)

    self.oob_score = oob_score
    self.n_jobs = int(n_jobs)
    self.random_state = random_state
    self.verbose = int(verbose)
    self.class_weight = class_weight

    # No model yet.
    self.estimator = None
def iterative_fit(self, X, y, n_iter=1, refit=False):
    """Incrementally train a Bernoulli naive Bayes model in 1000-row batches.

    On the first call (or when ``refit`` is true) the batch counter
    ``self.n_iter`` is reset to 0, ``fit_prior`` goes through
    ``check_for_bool`` and a fresh ``BernoulliNB`` is created. For targets
    with more than one column the estimator is wrapped in a
    ``OneVsRestClassifier`` and fitted in one go; otherwise up to ``n_iter``
    consecutive batches are fed to ``partial_fit``.

    ``self.fully_fit_`` becomes true once all rows have been consumed (or
    after the one-vs-rest fit).

    Args:
        X: Training data, sliced row-wise into batches.
        y: Targets.
        n_iter: Maximum number of batches to process in this call.
        refit: Discard any existing estimator and start over.

    Returns:
        self
    """
    import sklearn.naive_bayes

    if refit:
        self.estimator = None

    if self.estimator is None:
        self.n_iter = 0
        self.fully_fit_ = False
        self.fit_prior = check_for_bool(self.fit_prior)
        self.estimator = sklearn.naive_bayes.BernoulliNB(
            alpha=self.alpha, fit_prior=self.fit_prior)
        self.classes_ = np.unique(y.astype(int))

    # Fallback for multilabel classification
    if len(y.shape) > 1 and y.shape[1] > 1:
        import sklearn.multiclass

        self.estimator.n_iter = self.n_iter
        self.estimator = sklearn.multiclass.OneVsRestClassifier(
            self.estimator, n_jobs=1)
        self.estimator.fit(X, y)
        self.fully_fit_ = True
        return self

    batch_size = 1000
    for _ in range(n_iter):
        start = min(self.n_iter * batch_size, y.shape[0])
        # Upper limit, scipy.sparse doesn't seem to handle
        # max > len(matrix)
        stop = min((self.n_iter + 1) * batch_size, y.shape[0])
        batch = X[start:stop]
        if batch.shape[0] == 0:
            # Nothing left to consume.
            self.fully_fit_ = True
            break

        self.estimator.partial_fit(batch, y[start:stop], self.classes_)
        self.n_iter += 1

        if stop >= len(y):
            self.fully_fit_ = True
            break

    return self
def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
    """Train a passive-aggressive classifier a few epochs at a time.

    On the first call (or when ``refit`` is true) hyperparameters are coerced
    and a warm-started ``PassiveAggressiveClassifier`` with
    ``max_iter=n_iter`` is created and fitted. Later calls raise
    ``max_iter`` by ``n_iter`` (capped at 1000) and continue training via
    the estimator's private ``_partial_fit``. Targets with more than one
    column fall back to a ``OneVsRestClassifier`` with ``max_iter=50`` that
    is fitted in one go.

    At least two iterations per call are needed (hence ``n_iter=2``):
    convergence can only be detected when fewer iterations than requested
    were spent, which is impossible to observe with a single iteration.

    Args:
        X: Training data.
        y: Targets.
        n_iter: Number of additional iterations for this call.
        refit: Discard any existing estimator and start over.
        sample_weight: Forwarded to ``_partial_fit`` on warm-start calls.

    Returns:
        self
    """
    from sklearn.linear_model.passive_aggressive import \
        PassiveAggressiveClassifier

    if refit:
        self.estimator = None

    is_first_call = self.estimator is None
    if is_first_call:
        self.fully_fit_ = False
        self.average = check_for_bool(self.average)
        self.fit_intercept = check_for_bool(self.fit_intercept)
        self.tol = float(self.tol)
        self.C = float(self.C)

        self.estimator = PassiveAggressiveClassifier(
            C=self.C,
            fit_intercept=self.fit_intercept,
            max_iter=n_iter,
            tol=self.tol,
            loss=self.loss,
            shuffle=True,
            random_state=self.random_state,
            warm_start=True,
            average=self.average,
        )
        self.classes_ = np.unique(y.astype(int))

    # Fallback for multilabel classification
    if len(y.shape) > 1 and y.shape[1] > 1:
        import sklearn.multiclass

        self.estimator.max_iter = 50
        self.estimator = sklearn.multiclass.OneVsRestClassifier(
            self.estimator, n_jobs=1)
        self.estimator.fit(X, y)
        self.fully_fit_ = True
        return self

    if is_first_call:
        self.estimator.fit(X, y)
        return self

    # Warm start: extend the iteration budget and continue training.
    model = self.estimator
    model.max_iter = min(model.max_iter + n_iter, 1000)
    model._validate_params()
    learning_rate = "pa1" if model.loss == "hinge" else "pa2"
    model._partial_fit(
        X, y,
        alpha=1.0,
        C=model.C,
        loss="hinge",
        learning_rate=learning_rate,
        max_iter=n_iter,
        classes=None,
        sample_weight=sample_weight,
        coef_init=None,
        intercept_init=None,
    )

    # Done when the budget is exhausted or fewer iterations than
    # requested were needed.
    if model._max_iter >= 1000 or n_iter > model.n_iter_:
        self.fully_fit_ = True

    return self
def fit(self, X, Y):
    """Fit an ``sklearn.svm.SVC`` and store it as ``self.estimator``.

    The kernel cache size (in MB) is derived from the process's address-space
    limit: two thirds of (soft limit - memory already used). Whenever that
    cannot be determined - no positive soft limit, a negative result, or any
    error while querying - the cache falls back to 200 MB.

    Hyperparameters on ``self`` are coerced in place. ``max_iter`` is stored
    as an ``int`` because ``SVC`` documents it as an integer iteration count.

    Returns:
        self
    """
    import sklearn.svm

    # Calculate the size of the kernel cache (in MB) for sklearn's LibSVM.
    # The cache size is calculated as 2/3 of the available memory (which is
    # calculated as the memory limit minus the used memory)
    try:
        # Retrieve memory limits imposed on the process
        soft, _ = resource.getrlimit(resource.RLIMIT_AS)

        if soft > 0:
            # Convert limit to units of megabytes
            soft /= 1024 * 1024

            # Retrieve memory used by this process
            maxrss = resource.getrusage(resource.RUSAGE_SELF)[2] / 1024

            # In MacOS, the MaxRSS output of resource.getrusage in bytes;
            # on other platforms, it's in kilobytes
            if sys.platform == 'darwin':
                maxrss = maxrss / 1024

            cache_size = (soft - maxrss) / 1.5

            if cache_size < 0:
                cache_size = 200
        else:
            cache_size = 200
    except Exception:
        # Deliberate best effort: cache sizing must never prevent fitting.
        cache_size = 200

    self.C = float(self.C)
    if self.degree is None:
        self.degree = 3
    else:
        self.degree = int(self.degree)
    if self.gamma is None:
        self.gamma = 0.0
    else:
        self.gamma = float(self.gamma)
    if self.coef0 is None:
        self.coef0 = 0.0
    else:
        self.coef0 = float(self.coef0)
    self.tol = float(self.tol)
    # Go through float first so values such as "-1.0" are still accepted,
    # but hand SVC the integer it documents (previously a float was passed).
    self.max_iter = int(float(self.max_iter))
    self.shrinking = check_for_bool(self.shrinking)

    if check_none(self.class_weight):
        self.class_weight = None

    self.estimator = sklearn.svm.SVC(
        C=self.C,
        kernel=self.kernel,
        degree=self.degree,
        gamma=self.gamma,
        coef0=self.coef0,
        shrinking=self.shrinking,
        tol=self.tol,
        class_weight=self.class_weight,
        max_iter=self.max_iter,
        random_state=self.random_state,
        cache_size=cache_size,
        decision_function_shape='ovr',
    )
    self.estimator.fit(X, Y)
    return self
def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
    """Train an SGD classifier a few epochs at a time.

    On the first call (or when ``refit`` is true) hyperparameters are coerced
    (``l1_ratio``, ``epsilon`` and ``power_t`` default to 0.15, 0.1 and 0.5
    when they are ``None``) and a warm-started ``SGDClassifier`` with
    ``max_iter=n_iter`` is created and fitted. Later calls raise
    ``max_iter`` by ``n_iter`` (capped at 512) and continue training via the
    estimator's private ``_partial_fit``.

    At least two iterations per call are needed (hence ``n_iter=2``):
    convergence can only be detected when fewer iterations than requested
    were spent, which is impossible to observe with a single iteration.

    ``self.fully_fit_`` is set once the budget of 512 iterations is reached
    or the last call used fewer than ``n_iter`` iterations.

    Args:
        X: Training data.
        y: Targets.
        n_iter: Number of additional iterations for this call.
        refit: Discard any existing estimator and start over.
        sample_weight: Forwarded to ``fit`` / ``_partial_fit``.

    Returns:
        self
    """
    from sklearn.linear_model.stochastic_gradient import SGDClassifier

    if refit:
        self.estimator = None

    if self.estimator is not None:
        # Warm start: extend the iteration budget and continue training.
        sgd = self.estimator
        sgd.max_iter = min(sgd.max_iter + n_iter, 512)
        sgd._validate_params()
        sgd._partial_fit(
            X, y,
            alpha=sgd.alpha,
            C=1.0,
            loss=sgd.loss,
            learning_rate=sgd.learning_rate,
            max_iter=n_iter,
            sample_weight=sample_weight,
            classes=None,
            coef_init=None,
            intercept_init=None,
        )
    else:
        self.fully_fit_ = False

        self.alpha = float(self.alpha)
        if self.l1_ratio is not None:
            self.l1_ratio = float(self.l1_ratio)
        else:
            self.l1_ratio = 0.15
        if self.epsilon is not None:
            self.epsilon = float(self.epsilon)
        else:
            self.epsilon = 0.1
        self.eta0 = float(self.eta0)
        if self.power_t is not None:
            self.power_t = float(self.power_t)
        else:
            self.power_t = 0.5
        self.average = check_for_bool(self.average)
        self.fit_intercept = check_for_bool(self.fit_intercept)
        self.tol = float(self.tol)

        self.estimator = SGDClassifier(
            loss=self.loss,
            penalty=self.penalty,
            alpha=self.alpha,
            fit_intercept=self.fit_intercept,
            max_iter=n_iter,
            tol=self.tol,
            learning_rate=self.learning_rate,
            l1_ratio=self.l1_ratio,
            epsilon=self.epsilon,
            eta0=self.eta0,
            power_t=self.power_t,
            shuffle=True,
            average=self.average,
            random_state=self.random_state,
            warm_start=True,
        )
        self.estimator.fit(X, y, sample_weight=sample_weight)

    budget_exhausted = self.estimator._max_iter >= 512
    if budget_exhausted or n_iter > self.estimator.n_iter_:
        self.fully_fit_ = True

    return self