def pytest_runtest_setup(item):
    """Set the number of openmp threads based on the number of workers
    xdist is using to prevent oversubscription.

    Parameters
    ----------
    item : pytest item
        item to be processed
    """
    try:
        xdist_worker_count = int(os.environ['PYTEST_XDIST_WORKER_COUNT'])
    except KeyError:
        # raises when pytest-xdist is not installed
        return

    openmp_threads = _openmp_effective_n_threads()
    threads_per_worker = max(openmp_threads // xdist_worker_count, 1)
    threadpool_limits(threads_per_worker, user_api='openmp')
def pytest_runtest_setup(item):
    """Set the number of openmp threads based on the number of workers
    xdist is using to prevent oversubscription.

    Parameters
    ----------
    item : pytest item
        item to be processed
    """
    xdist_worker_count = environ.get("PYTEST_XDIST_WORKER_COUNT")
    if xdist_worker_count is None:
        # returns if pytest-xdist is not installed
        return
    else:
        xdist_worker_count = int(xdist_worker_count)

    openmp_threads = _openmp_effective_n_threads()
    threads_per_worker = max(openmp_threads // xdist_worker_count, 1)
    threadpool_limits(threads_per_worker, user_api="openmp")
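
# --- Illustrative sketch, not part of either conftest hook above. ---
# It shows the oversubscription arithmetic the two variants implement:
# with, say, 4 xdist workers and 8 effective OpenMP threads, each worker
# is capped at 2 threads. PYTEST_XDIST_WORKER_COUNT is normally set by
# pytest-xdist; it is set by hand here only to make the demo runnable.
import os

from threadpoolctl import threadpool_limits

from sklearn.utils._openmp_helpers import _openmp_effective_n_threads

os.environ["PYTEST_XDIST_WORKER_COUNT"] = "4"  # demo only

openmp_threads = _openmp_effective_n_threads()           # e.g. 8
workers = int(os.environ["PYTEST_XDIST_WORKER_COUNT"])   # 4
threads_per_worker = max(openmp_threads // workers, 1)   # 8 // 4 == 2

# Limit OpenMP thread pools in the current process, as the hooks do.
threadpool_limits(threads_per_worker, user_api="openmp")
print(threads_per_worker)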
# NOTE: the loss classes and dataset makers used below were missing from
# this snippet; their imports are assumed from the sklearn >= 1.1 layout.
from sklearn._loss.loss import (
    AbsoluteError,
    HalfBinomialLoss,
    HalfMultinomialLoss,
    HalfPoissonLoss,
    HalfSquaredError,
    PinballLoss,
)
from sklearn.base import is_regressor
from sklearn.datasets import make_classification, make_regression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_poisson_deviance
from sklearn.dummy import DummyRegressor
from sklearn.exceptions import NotFittedError
from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.utils import shuffle
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads

n_threads = _openmp_effective_n_threads()

_LOSSES = {
    "squared_error": HalfSquaredError,
    "absolute_error": AbsoluteError,
    "poisson": HalfPoissonLoss,
    "quantile": PinballLoss,
    "binary_crossentropy": HalfBinomialLoss,
    "categorical_crossentropy": HalfMultinomialLoss,
}

X_classification, y_classification = make_classification(random_state=0)
X_regression, y_regression = make_regression(random_state=0)
X_multi_classification, y_multi_classification = make_classification(
    n_classes=3, n_informative=3, random_state=0)
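
# --- Illustrative sketch, not from the original test module. ---
# It exercises the _LOSSES mapping above directly. The loss classes live
# in sklearn's private _loss module, so the
# .loss(y_true=..., raw_prediction=...) signature is an assumption based
# on sklearn >= 1.1 internals.
import numpy as np

loss = _LOSSES["squared_error"]()
y_true = np.array([0.0, 1.0, 2.0])
raw_prediction = np.array([0.5, 1.0, 1.5])
per_sample = loss.loss(y_true=y_true, raw_prediction=raw_prediction)
print(per_sample)  # 0.5 * (y_true - raw_prediction) ** 2, element-wise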
def _fit(self, X, y=None, sample_weight=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster. It must be noted that the data
        will be converted to C ordering, which will cause a memory
        copy if the given data is not C-contiguous.

    y : Ignored
        not used, present here for API consistency by convention.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)
    """
    if hasattr(self, 'precompute_distances'):
        if self.precompute_distances != 'deprecated':
            if sklearn_check_version('0.24'):
                warnings.warn(
                    "'precompute_distances' was deprecated in version "
                    "0.23 and will be removed in 1.0 (renaming of 0.25)."
                    " It has no effect", FutureWarning)
            elif sklearn_check_version('0.23'):
                warnings.warn(
                    "'precompute_distances' was deprecated in version "
                    "0.23 and will be removed in 0.25. It has no "
                    "effect", FutureWarning)

    self._n_threads = None
    if hasattr(self, 'n_jobs'):
        if self.n_jobs != 'deprecated':
            if sklearn_check_version('0.24'):
                warnings.warn(
                    "'n_jobs' was deprecated in version 0.23 and will be"
                    " removed in 1.0 (renaming of 0.25).", FutureWarning)
            elif sklearn_check_version('0.23'):
                warnings.warn(
                    "'n_jobs' was deprecated in version 0.23 and will be"
                    " removed in 0.25.", FutureWarning)
            self._n_threads = self.n_jobs
    self._n_threads = _openmp_effective_n_threads(self._n_threads)

    if self.n_init <= 0:
        raise ValueError(f"n_init should be > 0, got {self.n_init} instead.")

    random_state = check_random_state(self.random_state)

    if sklearn_check_version("1.0"):
        self._check_feature_names(X, reset=True)

    if self.max_iter <= 0:
        raise ValueError(
            f"max_iter should be > 0, got {self.max_iter} instead.")

    algorithm = self.algorithm
    if algorithm == "elkan" and self.n_clusters == 1:
        warnings.warn(
            "algorithm='elkan' doesn't make sense for a single "
            "cluster. Using 'full' instead.", RuntimeWarning)
        algorithm = "full"

    if algorithm == "auto":
        algorithm = "full" if self.n_clusters == 1 else "elkan"

    if algorithm not in ["full", "elkan"]:
        raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
                         " {}".format(str(algorithm)))

    X_len = _num_samples(X)

    _patching_status = PatchingConditionsChain("sklearn.cluster.KMeans.fit")
    _dal_ready = _patching_status.and_conditions([
        (self.n_clusters <= X_len,
         "The number of clusters is larger than the number of samples in X.")
    ])

    if _dal_ready and sample_weight is not None:
        if isinstance(sample_weight, numbers.Number):
            sample_weight = np.full(X_len, sample_weight, dtype=np.float64)
        else:
            sample_weight = np.asarray(sample_weight)
        _dal_ready = _patching_status.and_conditions([
            (sample_weight.shape == (X_len, ),
             "Sample weights do not have the same length as X."),
            (np.allclose(sample_weight, np.ones_like(sample_weight)),
             "Sample weights are not ones.")
        ])

    _patching_status.write_log()
    if _dal_ready:
        X = check_array(X, accept_sparse='csr',
                        dtype=[np.float64, np.float32])
        self.n_features_in_ = X.shape[1]
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            _daal4py_k_means_fit(
                X, self.n_clusters, self.max_iter, self.tol, self.init,
                self.n_init, self.verbose, random_state)
    else:
        super(KMeans, self).fit(X, y=y, sample_weight=sample_weight)
    return self
def _fit(self, X, y=None, sample_weight=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster. It must be noted that the data
        will be converted to C ordering, which will cause a memory
        copy if the given data is not C-contiguous.

    y : Ignored
        not used, present here for API consistency by convention.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)
    """
    if self.precompute_distances != 'deprecated':
        warnings.warn("'precompute_distances' was deprecated in version "
                      "0.23 and will be removed in 0.25. It has no "
                      "effect", FutureWarning)

    if self.n_jobs != 'deprecated':
        warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
                      " removed in 0.25.", FutureWarning)
        self._n_threads = self.n_jobs
    else:
        self._n_threads = None
    self._n_threads = _openmp_effective_n_threads(self._n_threads)

    if self.n_init <= 0:
        raise ValueError(
            f"n_init should be > 0, got {self.n_init} instead.")

    random_state = check_random_state(self.random_state)

    if self.max_iter <= 0:
        raise ValueError(
            f"max_iter should be > 0, got {self.max_iter} instead.")

    algorithm = self.algorithm
    if algorithm == "elkan" and self.n_clusters == 1:
        warnings.warn("algorithm='elkan' doesn't make sense for a single "
                      "cluster. Using 'full' instead.", RuntimeWarning)
        algorithm = "full"

    if algorithm == "auto":
        algorithm = "full" if self.n_clusters == 1 else "elkan"

    if algorithm not in ["full", "elkan"]:
        raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
                         " {}".format(str(algorithm)))

    daal_ready = True
    if daal_ready:
        X_len = _num_samples(X)
        daal_ready = (self.n_clusters <= X_len)
        if daal_ready and sample_weight is not None:
            sample_weight = np.asarray(sample_weight)
            daal_ready = (sample_weight.shape == (X_len,)) and (
                np.allclose(sample_weight, np.ones_like(sample_weight)))

    if daal_ready:
        logging.info(
            "sklearn.cluster.KMeans.fit: " + get_patch_message("daal"))
        X = check_array(X, accept_sparse='csr',
                        dtype=[np.float64, np.float32])
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            _daal4py_k_means_fit(
                X, self.n_clusters, self.max_iter, self.tol, self.init,
                self.n_init, self.verbose, random_state)
    else:
        logging.info(
            "sklearn.cluster.KMeans.fit: " + get_patch_message("sklearn"))
        super(KMeans, self).fit(X, y=y, sample_weight=sample_weight)
    return self
parser.add_argument('--bhtsne', action='store_true',
                    help="if set and the reference bhtsne code is "
                         "correctly installed, run it in the benchmark.")
parser.add_argument('--all', action='store_true',
                    help="if set, run the benchmark with the whole MNIST "
                         "dataset. Note that it will take up to 1 hour.")
parser.add_argument('--profile', action='store_true',
                    help="if set, run the benchmark with a memory "
                         "profiler.")
parser.add_argument('--verbose', type=int, default=0)
parser.add_argument('--pca-components', type=int, default=50,
                    help="Number of principal components for "
                         "preprocessing.")
args = parser.parse_args()

print("Used number of threads: {}".format(_openmp_effective_n_threads()))
X, y = load_data(order=args.order)

if args.pca_components > 0:
    t0 = time()
    X = PCA(n_components=args.pca_components).fit_transform(X)
    print("PCA preprocessing down to {} dimensions took {:0.3f}s"
          .format(args.pca_components, time() - t0))

methods = []

# Put TSNE in methods
tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity,
            verbose=args.verbose, n_iter=1000)
methods.append(("sklearn TSNE",
                lambda data: tsne_fit_transform(tsne, data)))
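
# load_data and tsne_fit_transform are defined elsewhere in the original
# benchmark script. A plausible, hypothetical definition of the latter
# (an assumption, not the original body): run fit_transform and report
# the embedding together with the number of iterations actually used.
def tsne_fit_transform(model, data):
    transformed = model.fit_transform(data)
    return transformed, model.n_iter_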
def fit(self, X, y=None, sample_weight=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster. It must be noted that the data
        will be converted to C ordering, which will cause a memory
        copy if the given data is not C-contiguous.

    y : Ignored
        not used, present here for API consistency by convention.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)
    """
    if self.precompute_distances != 'deprecated':
        warnings.warn("'precompute_distances' was deprecated in version "
                      "0.23 and will be removed in 0.25. It has no "
                      "effect", FutureWarning)

    if self.n_jobs != 'deprecated':
        warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
                      " removed in 0.25.", FutureWarning)
        self._n_threads = self.n_jobs
    else:
        self._n_threads = None
    self._n_threads = _openmp_effective_n_threads(self._n_threads)

    if self.n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % self.n_init)
    random_state = check_random_state(self.random_state)

    if self.max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % self.max_iter)

    # avoid forcing order when copy_x=False
    order = "C" if self.copy_x else None
    X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
                    order=order, copy=self.copy_x)

    algorithm = self.algorithm
    if algorithm == "elkan" and self.n_clusters == 1:
        warnings.warn("algorithm='elkan' doesn't make sense for a single "
                      "cluster. Using 'full' instead.", RuntimeWarning)
        algorithm = "full"

    if algorithm == "auto":
        algorithm = "full" if self.n_clusters == 1 else "elkan"

    if algorithm not in ["full", "elkan"]:
        raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
                         " {}".format(str(algorithm)))

    daal_ready = not sp.issparse(X)
    daal_ready = daal_ready and hasattr(X, '__array__')

    if daal_ready:
        X_len = _num_samples(X)
        daal_ready = (self.n_clusters <= X_len)
        if daal_ready and sample_weight is not None:
            sample_weight = np.asarray(sample_weight)
            daal_ready = (sample_weight.shape == (X_len,)) and (
                np.allclose(sample_weight, np.ones_like(sample_weight)))

    if daal_ready:
        X = check_array(X, dtype=[np.float64, np.float32])
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            _daal4py_k_means_dense(
                X, self.n_clusters, self.max_iter, self.tol, self.init,
                self.n_init, random_state)
    else:
        super(KMeans, self).fit(X, y=y, sample_weight=sample_weight)
    return self
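
# --- Illustrative sketch, separate from the KMeans variants above. ---
# All three _fit/fit variants funnel the deprecated n_jobs through
# _openmp_effective_n_threads. To the best of our understanding of that
# helper: None means "use all effective OpenMP threads", a positive value
# is returned as requested, and a negative value counts back from the
# maximum, joblib-style.
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads

print(_openmp_effective_n_threads())    # None -> max effective threads
print(_openmp_effective_n_threads(2))   # explicit request -> 2
print(_openmp_effective_n_threads(-1))  # counts back from the maximum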
def _tsne(self, P, degrees_of_freedom, n_samples, X_embedded,
          neighbors=None, skip_num_points=0):
    """Runs t-SNE."""
    # t-SNE minimizes the Kullback-Leibler divergence of the Gaussians P
    # and the Student's t-distributions Q. The optimization algorithm that
    # we use is batch gradient descent with two stages:
    # * initial optimization with early exaggeration and momentum at 0.5
    # * final optimization with momentum at 0.8
    self.n_samples = n_samples

    params = X_embedded.ravel()

    opt_args = {
        "it": 0,
        "n_iter_check": self._N_ITER_CHECK,
        "min_grad_norm": self.min_grad_norm,
        "learning_rate": self.learning_rate,
        "verbose": self.verbose,
        "kwargs": dict(skip_num_points=skip_num_points),
        "args": [P, degrees_of_freedom, n_samples, self.n_components],
        "n_iter_without_progress": self._EXPLORATION_N_ITER,
        "n_iter": self._EXPLORATION_N_ITER,
        "momentum": 0.5,
    }
    if self.method == 'barnes_hut':
        obj_func = _kl_divergence_bh
        opt_args['kwargs']['angle'] = self.angle
        # Repeat verbose argument for _kl_divergence_bh
        opt_args['kwargs']['verbose'] = self.verbose
        # Get the number of threads for gradient computation here to
        # avoid recomputing it at each iteration.
        opt_args['kwargs']['num_threads'] = _openmp_effective_n_threads()
    else:
        obj_func = _kl_divergence

    # Learning schedule (part 1): do 250 iterations with lower momentum but
    # higher learning rate controlled via the early exaggeration parameter
    P *= self.early_exaggeration
    params, kl_divergence, it = self._gradient_descent(obj_func, params,
                                                       **opt_args)
    if self.verbose:
        print("[t-SNE] KL divergence after %d iterations with early "
              "exaggeration: %f" % (it + 1, kl_divergence))

    # Learning schedule (part 2): disable early exaggeration and finish
    # optimization with a higher momentum at 0.8
    P /= self.early_exaggeration
    remaining = self.n_iter - self._EXPLORATION_N_ITER
    if it < self._EXPLORATION_N_ITER or remaining > 0:
        opt_args['n_iter'] = self.n_iter
        opt_args['it'] = it + 1
        opt_args['momentum'] = 0.8
        opt_args['n_iter_without_progress'] = self.n_iter_without_progress
        params, kl_divergence, it = self._gradient_descent(obj_func, params,
                                                           **opt_args)

    # Save the final number of iterations
    self.n_iter_ = it

    if self.verbose:
        print("[t-SNE] KL divergence after %d iterations: %f"
              % (it + 1, kl_divergence))

    X_embedded = params.reshape(n_samples, self.n_components)
    self.kl_divergence_ = kl_divergence

    return X_embedded
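
# --- Illustrative sketch of the public entry point, not from this file. ---
# _tsne() is driven by TSNE.fit/fit_transform; method='barnes_hut' selects
# the _kl_divergence_bh objective shown above.
import numpy as np
from sklearn.manifold import TSNE

X = np.random.RandomState(0).rand(100, 10)
X_embedded = TSNE(n_components=2, method='barnes_hut', angle=0.5,
                  init='pca', random_state=0).fit_transform(X)
print(X_embedded.shape)  # (100, 2)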
"""Benchmark script for HistGradientBoostingClassifier.

Output written as "bench_loss_module_hgbt.parquet"
"""
from collections import OrderedDict

from neurtu import delayed, Benchmark
import numpy as np
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils._openmp_helpers import (
    _openmp_parallelism_enabled,
    _openmp_effective_n_threads,
)

print("openmp enabled: ", _openmp_parallelism_enabled())
print("openmp threads: ", _openmp_effective_n_threads())
n_threads = _openmp_effective_n_threads()

n_samples, n_features = 100_000, 20
n_informative = int(n_features * 0.9)

bench_options = {
    "wall_time": True,
    "cpu_time": True,
    "peak_memory": True,
    "repeat": 20,
}

early_stopping = [True, False]
options = {}


def benchmark_cases(X, y):
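
# The body of benchmark_cases is truncated above. For orientation only,
# a self-contained sketch of the kind of case it would generate, using
# plain stdlib timing instead of neurtu (an assumption, not the original
# body):
from time import perf_counter

X, y = make_classification(n_samples=n_samples, n_features=n_features,
                           n_informative=n_informative, random_state=0)
for early_stop in early_stopping:
    clf = HistGradientBoostingClassifier(early_stopping=early_stop,
                                         random_state=0)
    tic = perf_counter()
    clf.fit(X, y)
    print(f"early_stopping={early_stop}: fit in {perf_counter() - tic:.2f}s")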
def __init__(
    self,
    X_binned,
    gradients,
    hessians,
    max_leaf_nodes=None,
    max_depth=None,
    min_samples_leaf=20,
    min_gain_to_split=0.0,
    n_bins=256,
    n_bins_non_missing=None,
    has_missing_values=False,
    is_categorical=None,
    monotonic_cst=None,
    l2_regularization=0.0,
    min_hessian_to_split=1e-3,
    shrinkage=1.0,
    n_threads=None,
):
    self._validate_parameters(
        X_binned,
        max_leaf_nodes,
        max_depth,
        min_samples_leaf,
        min_gain_to_split,
        l2_regularization,
        min_hessian_to_split,
    )
    n_threads = _openmp_effective_n_threads(n_threads)

    if n_bins_non_missing is None:
        n_bins_non_missing = n_bins - 1

    if isinstance(n_bins_non_missing, numbers.Integral):
        n_bins_non_missing = np.array(
            [n_bins_non_missing] * X_binned.shape[1], dtype=np.uint32)
    else:
        n_bins_non_missing = np.asarray(n_bins_non_missing, dtype=np.uint32)

    if isinstance(has_missing_values, bool):
        has_missing_values = [has_missing_values] * X_binned.shape[1]
    has_missing_values = np.asarray(has_missing_values, dtype=np.uint8)

    if monotonic_cst is None:
        self.with_monotonic_cst = False
        monotonic_cst = np.full(
            shape=X_binned.shape[1],
            fill_value=MonotonicConstraint.NO_CST,
            dtype=np.int8,
        )
    else:
        self.with_monotonic_cst = True
        monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)

        if monotonic_cst.shape[0] != X_binned.shape[1]:
            raise ValueError(
                "monotonic_cst has shape {} but the input data "
                "X has {} features.".format(
                    monotonic_cst.shape[0], X_binned.shape[1]))
        if np.any(monotonic_cst < -1) or np.any(monotonic_cst > 1):
            raise ValueError(
                "monotonic_cst must be None or an array-like of -1, 0 or 1."
            )

    if is_categorical is None:
        is_categorical = np.zeros(shape=X_binned.shape[1], dtype=np.uint8)
    else:
        is_categorical = np.asarray(is_categorical, dtype=np.uint8)

    if np.any(
        np.logical_and(is_categorical == 1,
                       monotonic_cst != MonotonicConstraint.NO_CST)):
        raise ValueError(
            "Categorical features cannot have monotonic constraints.")

    hessians_are_constant = hessians.shape[0] == 1
    self.histogram_builder = HistogramBuilder(X_binned, n_bins, gradients,
                                              hessians, hessians_are_constant,
                                              n_threads)
    missing_values_bin_idx = n_bins - 1
    self.splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
        n_threads,
    )
    self.n_bins_non_missing = n_bins_non_missing
    self.missing_values_bin_idx = missing_values_bin_idx
    self.max_leaf_nodes = max_leaf_nodes
    self.has_missing_values = has_missing_values
    self.monotonic_cst = monotonic_cst
    self.is_categorical = is_categorical
    self.l2_regularization = l2_regularization
    self.n_features = X_binned.shape[1]
    self.max_depth = max_depth
    self.min_samples_leaf = min_samples_leaf
    self.X_binned = X_binned
    self.min_gain_to_split = min_gain_to_split
    self.shrinkage = shrinkage
    self.n_threads = n_threads
    self.splittable_nodes = []
    self.finalized_leaves = []
    self.total_find_split_time = 0.0  # time spent finding the best splits
    self.total_compute_hist_time = 0.0  # time spent computing histograms
    self.total_apply_split_time = 0.0  # time spent splitting nodes
    self.n_categorical_splits = 0
    self._intilialize_root(gradients, hessians, hessians_are_constant)
    self.n_nodes = 1
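
# --- Illustrative driver for the grower, based on sklearn's own tests. ---
# Shapes and dtypes here are assumptions: binned uint8 features produced by
# _BinMapper, per-sample gradients, and a length-1 array of constant
# hessians in G_H_DTYPE (which makes hessians_are_constant True above).
import numpy as np

from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower

rng = np.random.RandomState(0)
n_samples = 1000
X = rng.normal(size=(n_samples, 4))
X_binned = _BinMapper(n_bins=256).fit_transform(X)  # F-aligned uint8

gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
hessians = np.ones(shape=1, dtype=G_H_DTYPE)  # constant hessians

grower = TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=31)
grower.grow()
print(len(grower.finalized_leaves))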
def learn_clusters(n_clust):
    client = Client(n_workers=4, processes=True)

    # 1. Learn clusters
    # Full set
    kmeans_path = 'Clustering/KMeans/n{}posts.joblib'.format(n_clust)
    array = da.from_npy_stack(npy_stack_path)
    kmeans = KMeans(n_clusters=n_clust)
    # Learn on a part of the set
    # array = np.load('Clustering/npy_post_vecs_part/0.npy')
    # kmeans = SKMeans(n_clusters=n_clust)
    print('Fitting')
    kmeans.fit(array)
    del array

    # Dump centroids to the disk.
    # Dump as a sklearn object, for (maybe) faster prediction
    # and fewer problems
    skmeans = SKMeans(n_clusters=n_clust)
    skmeans.cluster_centers_ = kmeans.cluster_centers_
    skmeans._n_threads = _openmp_effective_n_threads()
    dump(skmeans, kmeans_path)
    del kmeans, skmeans
    # For learning on a part of the set:
    # dump(kmeans, kmeans_path)
    # del kmeans
    print('Fitted')

    # 3. Turn posts into clusters
    kmeans_path = 'Clustering/KMeans/n{}posts.joblib'.format(n_clust)
    df = dd.read_parquet('preprocessed.parquet')
    df = df.map_partitions(df_to_vector_predict, kmeans_path, meta={
        'user_id': int,
        'post_id': int,
        'text': object,
        'type': str,
        'date': str,
        'cluster': int
    })
    df.to_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))
    print('Clustered')

    # 2.5. Filter outdated posts out. (Next time, write the date of parsing
    # to user_info.) For each user, find their last like and filter out
    # likes that are older than the last + half a year.
    df = dd.read_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))
    print('Original df len: {}'.format(len(df)))
    year = 31536000  # One year as a timestamp delta
    kyear = 20
    break_time = kyear * year  # 0.75*year - a quarter to year
    # Set has been fully collected on 8 of June 2020
    last_like = df['date'].max().compute()
    # Pass only a quarter-to-year of recent likes
    df = df[df['date'] > last_like - break_time]
    print('max date: {}'.format(df['date'].max().compute()))
    print('min date: {}'.format(df['date'].min().compute()))
    print('Filtered df len: {}'.format(len(df)))
    print('Likes have been filtered out by date')

    # 3. Group clusters by user_id and turn them into a single vector
    # for each user
    # df = dd.read_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))  # INSTEAD OF FILTER!
    # - Count text_likes number for each user (and later merge with user_info)
    count = df.drop(columns=['post_id', 'type', 'date', 'cluster']).groupby(
        'user_id')['text'].count().compute()
    count.rename('text_likes', inplace=True)

    # Generate meta
    meta = {'user_id': int}
    for i in range(n_clust):
        meta[i] = float
    df = df.map_partitions(
        lambda df_part: kt.clusters_to_vector(df_part, n_clust), meta=meta)
    df.to_parquet(
        'Clustering/KMeans/n{}posts-cluster_vecs.parquet'.format(n_clust))

    # 5. Merge clusters and user_info dataframes. (Working with pandas frames)
    df_info = pd.read_csv('users_info.csv')
    df_info = df_info.merge(count, on='user_id', how='inner')
    del count
    df = pd.read_parquet(
        'Clustering/KMeans/n{}posts-cluster_vecs.parquet'.format(n_clust))
    # Merging user's info and clusters. Maybe the mistake is here
    df = df_info.merge(df, on='user_id', how='inner')
    df.to_csv('Clustering/KMeans/n{}-final_dataset-{}year.csv'.format(
        n_clust, kyear))
    print('Final dataset has been saved')
    del df_info

    # Filter some users out
    # df = pd.read_csv('Clustering/KMeans/n{}-final_dataset.csv'.format(n_clust)).drop(columns=['Unnamed: 0'])  # TESTING
    df = df.loc[(df['text_likes'] > 100) & (df['text_likes'] < 1000)]
    df['bdate'] = df['bdate'].apply(
        lambda bd: time.mktime(datetime.strptime(bd, "%d.%m.%Y").timetuple()))

    # Clean up the dataset
    df = df.drop(columns=[
        'posts_n', 'text_likes', 'status', 'sex', 'smoking', 'alcohol',
        'parth_id', 'country', 'city', 'user_id'
    ]).dropna().reset_index(drop=True)

    # 6. Supervise a Linear Regression model
    regr = LinearRegression()
    R2 = train(df, regr)
    client.close()
    return R2