Example #1
import os

from threadpoolctl import threadpool_limits

from sklearn.utils._openmp_helpers import _openmp_effective_n_threads


def pytest_runtest_setup(item):
    """Set the number of openmp threads based on the number of workers
    xdist is using to prevent oversubscription.

    Parameters
    ----------
    item : pytest item
        item to be processed
    """
    try:
        xdist_worker_count = int(os.environ['PYTEST_XDIST_WORKER_COUNT'])
    except KeyError:
        # PYTEST_XDIST_WORKER_COUNT is only set when running under pytest-xdist
        return

    openmp_threads = _openmp_effective_n_threads()
    threads_per_worker = max(openmp_threads // xdist_worker_count, 1)
    threadpool_limits(threads_per_worker, user_api='openmp')
Example #2
from os import environ

from threadpoolctl import threadpool_limits

from sklearn.utils._openmp_helpers import _openmp_effective_n_threads


def pytest_runtest_setup(item):
    """Set the number of openmp threads based on the number of workers
    xdist is using to prevent oversubscription.

    Parameters
    ----------
    item : pytest item
        item to be processed
    """
    xdist_worker_count = environ.get("PYTEST_XDIST_WORKER_COUNT")
    if xdist_worker_count is None:
        # PYTEST_XDIST_WORKER_COUNT is only set when running under pytest-xdist
        return
    else:
        xdist_worker_count = int(xdist_worker_count)

    openmp_threads = _openmp_effective_n_threads()
    threads_per_worker = max(openmp_threads // xdist_worker_count, 1)
    threadpool_limits(threads_per_worker, user_api="openmp")
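
The two hooks above are variants of the same conftest.py pattern: split the effective OpenMP thread budget across the pytest-xdist workers so the test run does not oversubscribe the machine. As a minimal sketch (not from the original project), assuming threadpoolctl is installed, the applied limit can be inspected from inside a test via its public threadpool_info():

from threadpoolctl import threadpool_info


def test_openmp_threads_are_limited():
    # threadpool_info() lists every thread pool loaded in the process
    # (OpenMP, BLAS, ...) together with its current thread count.
    openmp_pools = [tp for tp in threadpool_info() if tp["user_api"] == "openmp"]
    for tp in openmp_pools:
        assert tp["num_threads"] >= 1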
Example #3
from sklearn.datasets import make_classification, make_regression
# The loss classes referenced in _LOSSES below are private; in recent
# scikit-learn releases they live in sklearn._loss.loss (the exact path may
# vary across versions).
from sklearn._loss.loss import (
    AbsoluteError,
    HalfBinomialLoss,
    HalfMultinomialLoss,
    HalfPoissonLoss,
    HalfSquaredError,
    PinballLoss,
)
from sklearn.base import is_regressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_poisson_deviance
from sklearn.dummy import DummyRegressor
from sklearn.exceptions import NotFittedError
from sklearn.compose import make_column_transformer

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.utils import shuffle
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads

n_threads = _openmp_effective_n_threads()

_LOSSES = {
    "squared_error": HalfSquaredError,
    "absolute_error": AbsoluteError,
    "poisson": HalfPoissonLoss,
    "quantile": PinballLoss,
    "binary_crossentropy": HalfBinomialLoss,
    "categorical_crossentropy": HalfMultinomialLoss,
}

X_classification, y_classification = make_classification(random_state=0)
X_regression, y_regression = make_regression(random_state=0)
X_multi_classification, y_multi_classification = make_classification(
    n_classes=3, n_informative=3, random_state=0)
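
The module-level setup above generates toy classification and regression data for the histogram-based gradient boosting tests. For context, a minimal sketch (not part of the original test module) of fitting the imported estimators on that data:

# Illustrative only: fit the imported estimators on the toy data created above.
clf = HistGradientBoostingClassifier(random_state=0)
clf.fit(X_classification, y_classification)

reg = HistGradientBoostingRegressor(random_state=0)
reg.fit(X_regression, y_regression)

print("classifier accuracy:", clf.score(X_classification, y_classification))
print("regressor R^2:", reg.score(X_regression, y_regression))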
Example #4
def _fit(self, X, y=None, sample_weight=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster. It must be noted that the data
        will be converted to C ordering, which will cause a memory
        copy if the given data is not C-contiguous.

    y : Ignored
        not used, present here for API consistency by convention.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    """
    if hasattr(self, 'precompute_distances'):
        if self.precompute_distances != 'deprecated':
            if sklearn_check_version('0.24'):
                warnings.warn(
                    "'precompute_distances' was deprecated in version "
                    "0.23 and will be removed in 1.0 (renaming of 0.25)."
                    " It has no effect", FutureWarning)
            elif sklearn_check_version('0.23'):
                warnings.warn(
                    "'precompute_distances' was deprecated in version "
                    "0.23 and will be removed in 0.25. It has no "
                    "effect", FutureWarning)

    self._n_threads = None
    if hasattr(self, 'n_jobs'):
        if self.n_jobs != 'deprecated':
            if sklearn_check_version('0.24'):
                warnings.warn(
                    "'n_jobs' was deprecated in version 0.23 and will be"
                    " removed in 1.0 (renaming of 0.25).", FutureWarning)
            elif sklearn_check_version('0.23'):
                warnings.warn(
                    "'n_jobs' was deprecated in version 0.23 and will be"
                    " removed in 0.25.", FutureWarning)
            self._n_threads = self.n_jobs
    self._n_threads = _openmp_effective_n_threads(self._n_threads)

    if self.n_init <= 0:
        raise ValueError(f"n_init should be > 0, got {self.n_init} instead.")

    random_state = check_random_state(self.random_state)
    if sklearn_check_version("1.0"):
        self._check_feature_names(X, reset=True)

    if self.max_iter <= 0:
        raise ValueError(
            f"max_iter should be > 0, got {self.max_iter} instead.")

    algorithm = self.algorithm
    if algorithm == "elkan" and self.n_clusters == 1:
        warnings.warn(
            "algorithm='elkan' doesn't make sense for a single "
            "cluster. Using 'full' instead.", RuntimeWarning)
        algorithm = "full"

    if algorithm == "auto":
        algorithm = "full" if self.n_clusters == 1 else "elkan"

    if algorithm not in ["full", "elkan"]:
        raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
                         " {}".format(str(algorithm)))

    X_len = _num_samples(X)

    _patching_status = PatchingConditionsChain("sklearn.cluster.KMeans.fit")
    _dal_ready = _patching_status.and_conditions([
        (self.n_clusters <= X_len,
         "The number of clusters is larger than the number of samples in X.")
    ])

    if _dal_ready and sample_weight is not None:
        if isinstance(sample_weight, numbers.Number):
            sample_weight = np.full(X_len, sample_weight, dtype=np.float64)
        else:
            sample_weight = np.asarray(sample_weight)
        _dal_ready = _patching_status.and_conditions([
            (sample_weight.shape == (X_len, ),
             "Sample weights do not have the same length as X."),
            (np.allclose(sample_weight, np.ones_like(sample_weight)),
             "Sample weights are not ones.")
        ])

    _patching_status.write_log()
    if _dal_ready:
        X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32])
        self.n_features_in_ = X.shape[1]
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            _daal4py_k_means_fit(
                X, self.n_clusters, self.max_iter, self.tol, self.init, self.n_init,
                self.verbose, random_state)
    else:
        super(KMeans, self).fit(X, y=y, sample_weight=sample_weight)
    return self
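
This `_fit` appears to come from Intel's scikit-learn extension (daal4py / scikit-learn-intelex), which routes KMeans.fit to oneDAL when the patching conditions hold and falls back to stock scikit-learn otherwise. A minimal sketch of enabling such a patched estimator, assuming scikit-learn-intelex is installed:

# Sketch only: patch_sklearn() swaps in the accelerated implementations before
# sklearn.cluster.KMeans is imported, so fit() goes through a path like _fit above.
import numpy as np
from sklearnex import patch_sklearn

patch_sklearn()

from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(1000, 8)
km = KMeans(n_clusters=4, random_state=0).fit(X)
print(km.inertia_, km.n_iter_)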
Example #5
def _fit(self, X, y=None, sample_weight=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster. It must be noted that the data
        will be converted to C ordering, which will cause a memory
        copy if the given data is not C-contiguous.

    y : Ignored
        not used, present here for API consistency by convention.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    """
    if self.precompute_distances != 'deprecated':
        warnings.warn("'precompute_distances' was deprecated in version "
                      "0.23 and will be removed in 0.25. It has no "
                      "effect", FutureWarning)

    if self.n_jobs != 'deprecated':
        warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
                      " removed in 0.25.", FutureWarning)
        self._n_threads = self.n_jobs
    else:
        self._n_threads = None
    self._n_threads = _openmp_effective_n_threads(self._n_threads)

    if self.n_init <= 0:
        raise ValueError(
                f"n_init should be > 0, got {self.n_init} instead.")

    random_state = check_random_state(self.random_state)

    if self.max_iter <= 0:
        raise ValueError(
                f"max_iter should be > 0, got {self.max_iter} instead.")

    algorithm = self.algorithm
    if algorithm == "elkan" and self.n_clusters == 1:
        warnings.warn("algorithm='elkan' doesn't make sense for a single "
                      "cluster. Using 'full' instead.", RuntimeWarning)
        algorithm = "full"

    if algorithm == "auto":
        algorithm = "full" if self.n_clusters == 1 else "elkan"

    if algorithm not in ["full", "elkan"]:
        raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
                         " {}".format(str(algorithm)))


    daal_ready = True
    if daal_ready:
        X_len = _num_samples(X)
        daal_ready = (self.n_clusters <= X_len)
        if daal_ready and sample_weight is not None:
            sample_weight = np.asarray(sample_weight)
            daal_ready = (sample_weight.shape == (X_len,)) and (
                         np.allclose(sample_weight, np.ones_like(sample_weight)))

    if daal_ready:
        logging.info("sklearn.cluster.KMeans.fit: " + get_patch_message("daal"))
        X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32])
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            _daal4py_k_means_fit(
                X, self.n_clusters, self.max_iter, self.tol, self.init, self.n_init,
                self.verbose, random_state)
    else:
        logging.info("sklearn.cluster.KMeans.fit: " + get_patch_message("sklearn"))
        super(KMeans, self).fit(X, y=y, sample_weight=sample_weight)
    return self
Example #6
    parser.add_argument('--bhtsne', action='store_true',
                        help="if set and the reference bhtsne code is "
                        "correctly installed, run it in the benchmark.")
    parser.add_argument('--all', action='store_true',
                        help="if set, run the benchmark with the whole MNIST."
                             "dataset. Note that it will take up to 1 hour.")
    parser.add_argument('--profile', action='store_true',
                        help="if set, run the benchmark with a memory "
                             "profiler.")
    parser.add_argument('--verbose', type=int, default=0)
    parser.add_argument('--pca-components', type=int, default=50,
                        help="Number of principal components for "
                             "preprocessing.")
    args = parser.parse_args()

    print("Used number of threads: {}".format(_openmp_effective_n_threads()))
    X, y = load_data(order=args.order)

    if args.pca_components > 0:
        t0 = time()
        X = PCA(n_components=args.pca_components).fit_transform(X)
        print("PCA preprocessing down to {} dimensions took {:0.3f}s"
              .format(args.pca_components, time() - t0))

    methods = []

    # Put TSNE in methods
    tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity,
                verbose=args.verbose, n_iter=1000)
    methods.append(("sklearn TSNE",
                    lambda data: tsne_fit_transform(tsne, data)))
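
The fragment above times scikit-learn's TSNE after an optional PCA reduction; the tsne_fit_transform helper it passes to the benchmark is not shown. A minimal illustrative stand-in (hypothetical, not the benchmark's actual helper) that times a fit_transform call could look like this:

from time import time

from sklearn.manifold import TSNE


def time_fit_transform(model, data):
    # Hypothetical helper: return the embedding and the wall-clock duration.
    t0 = time()
    embedding = model.fit_transform(data)
    return embedding, time() - t0


tsne = TSNE(n_components=2, init='pca', perplexity=30, verbose=1)
# embedding, duration = time_fit_transform(tsne, X)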
Example #7
def fit(self, X, y=None, sample_weight=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster. It must be noted that the data
        will be converted to C ordering, which will cause a memory
        copy if the given data is not C-contiguous.

    y : Ignored
        not used, present here for API consistency by convention.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    """
    if self.precompute_distances != 'deprecated':
        warnings.warn("'precompute_distances' was deprecated in version "
                      "0.23 and will be removed in 0.25. It has no "
                      "effect", FutureWarning)

    if self.n_jobs != 'deprecated':
        warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
                      " removed in 0.25.", FutureWarning)
        self._n_threads = self.n_jobs
    else:
        self._n_threads = None
    self._n_threads = _openmp_effective_n_threads(self._n_threads)

    if self.n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % self.n_init)

    random_state = check_random_state(self.random_state)

    if self.max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % self.max_iter)

    # avoid forcing order when copy_x=False
    order = "C" if self.copy_x else None
    X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
                    order=order, copy=self.copy_x)

    algorithm = self.algorithm
    if algorithm == "elkan" and self.n_clusters == 1:
        warnings.warn("algorithm='elkan' doesn't make sense for a single "
                      "cluster. Using 'full' instead.", RuntimeWarning)
        algorithm = "full"

    if algorithm == "auto":
        algorithm = "full" if self.n_clusters == 1 else "elkan"

    if algorithm not in ["full", "elkan"]:
        raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
                         " {}".format(str(algorithm)))

        
    daal_ready = not sp.issparse(X)
    daal_ready = daal_ready and hasattr(X, '__array__')

    if daal_ready:
        X_len = _num_samples(X)
        daal_ready = (self.n_clusters <= X_len)
        if daal_ready and sample_weight is not None:
            sample_weight = np.asarray(sample_weight)
            daal_ready = (sample_weight.shape == (X_len,)) and (
                         np.allclose(sample_weight, np.ones_like(sample_weight)))

    if daal_ready:
        X = check_array(X, dtype=[np.float64, np.float32])
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            _daal4py_k_means_dense(
                X, self.n_clusters, self.max_iter, self.tol, self.init, self.n_init,
                random_state)
    else: 
        super(KMeans, self).fit(X, y=y, sample_weight=sample_weight)
    return self
Example #8
    def _tsne(self,
              P,
              degrees_of_freedom,
              n_samples,
              X_embedded,
              neighbors=None,
              skip_num_points=0):
        """Runs t-SNE."""
        # t-SNE minimizes the Kullback-Leibler divergence of the Gaussians P
        # and the Student's t-distributions Q. The optimization algorithm that
        # we use is batch gradient descent with two stages:
        # * initial optimization with early exaggeration and momentum at 0.5
        # * final optimization with momentum at 0.8
        self.n_samples = n_samples
        params = X_embedded.ravel()

        opt_args = {
            "it": 0,
            "n_iter_check": self._N_ITER_CHECK,
            "min_grad_norm": self.min_grad_norm,
            "learning_rate": self.learning_rate,
            "verbose": self.verbose,
            "kwargs": dict(skip_num_points=skip_num_points),
            "args": [P, degrees_of_freedom, n_samples, self.n_components],
            "n_iter_without_progress": self._EXPLORATION_N_ITER,
            "n_iter": self._EXPLORATION_N_ITER,
            "momentum": 0.5,
        }
        if self.method == 'barnes_hut':
            obj_func = _kl_divergence_bh
            opt_args['kwargs']['angle'] = self.angle
            # Repeat verbose argument for _kl_divergence_bh
            opt_args['kwargs']['verbose'] = self.verbose
            # Get the number of threads for gradient computation here to
            # avoid recomputing it at each iteration.
            opt_args['kwargs']['num_threads'] = _openmp_effective_n_threads()
        else:
            obj_func = _kl_divergence

        # Learning schedule (part 1): do 250 iteration with lower momentum but
        # higher learning rate controlled via the early exaggeration parameter
        P *= self.early_exaggeration
        params, kl_divergence, it = self._gradient_descent(
            obj_func, params, **opt_args)
        if self.verbose:
            print("[t-SNE] KL divergence after %d iterations with early "
                  "exaggeration: %f" % (it + 1, kl_divergence))

        # Learning schedule (part 2): disable early exaggeration and finish
        # optimization with a higher momentum at 0.8
        P /= self.early_exaggeration
        remaining = self.n_iter - self._EXPLORATION_N_ITER
        if it < self._EXPLORATION_N_ITER or remaining > 0:
            opt_args['n_iter'] = self.n_iter
            opt_args['it'] = it + 1
            opt_args['momentum'] = 0.8
            opt_args['n_iter_without_progress'] = self.n_iter_without_progress
            params, kl_divergence, it = self._gradient_descent(
                obj_func, params, **opt_args)

        # Save the final number of iterations
        self.n_iter_ = it

        if self.verbose:
            print("[t-SNE] KL divergence after %d iterations: %f" %
                  (it + 1, kl_divergence))

        X_embedded = params.reshape(n_samples, self.n_components)
        self.kl_divergence_ = kl_divergence

        return X_embedded
Example #9
"""Benchmark script for HistGradientBoostingClassifier.

Output written as "bench_loss_module_hgbt.parquet"
"""
from collections import OrderedDict
from neurtu import delayed, Benchmark
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils._openmp_helpers import _openmp_parallelism_enabled, _openmp_effective_n_threads

print("openmp enabled: ", _openmp_parallelism_enabled())
print("openmp threads: ", _openmp_effective_n_threads())
n_threads = _openmp_effective_n_threads()

n_samples, n_features = 100_000, 20
n_informative = int(n_features * 0.9)
bench_options = {
    "wall_time": True,
    "cpu_time": True,
    "peak_memory": True,
    "repeat": 20,
}
early_stopping = [True, False]
options = {}


def benchmark_cases(X, y):
Example #10
File: grower.py Project: Aathi410/Pro123
    def __init__(
        self,
        X_binned,
        gradients,
        hessians,
        max_leaf_nodes=None,
        max_depth=None,
        min_samples_leaf=20,
        min_gain_to_split=0.0,
        n_bins=256,
        n_bins_non_missing=None,
        has_missing_values=False,
        is_categorical=None,
        monotonic_cst=None,
        l2_regularization=0.0,
        min_hessian_to_split=1e-3,
        shrinkage=1.0,
        n_threads=None,
    ):

        self._validate_parameters(
            X_binned,
            max_leaf_nodes,
            max_depth,
            min_samples_leaf,
            min_gain_to_split,
            l2_regularization,
            min_hessian_to_split,
        )
        n_threads = _openmp_effective_n_threads(n_threads)

        if n_bins_non_missing is None:
            n_bins_non_missing = n_bins - 1

        if isinstance(n_bins_non_missing, numbers.Integral):
            n_bins_non_missing = np.array([n_bins_non_missing] *
                                          X_binned.shape[1],
                                          dtype=np.uint32)
        else:
            n_bins_non_missing = np.asarray(n_bins_non_missing,
                                            dtype=np.uint32)

        if isinstance(has_missing_values, bool):
            has_missing_values = [has_missing_values] * X_binned.shape[1]
        has_missing_values = np.asarray(has_missing_values, dtype=np.uint8)

        if monotonic_cst is None:
            self.with_monotonic_cst = False
            monotonic_cst = np.full(
                shape=X_binned.shape[1],
                fill_value=MonotonicConstraint.NO_CST,
                dtype=np.int8,
            )
        else:
            self.with_monotonic_cst = True
            monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)

            if monotonic_cst.shape[0] != X_binned.shape[1]:
                raise ValueError(
                    "monotonic_cst has shape {} but the input data "
                    "X has {} features.".format(monotonic_cst.shape[0],
                                                X_binned.shape[1]))
            if np.any(monotonic_cst < -1) or np.any(monotonic_cst > 1):
                raise ValueError(
                    "monotonic_cst must be None or an array-like of -1, 0 or 1."
                )

        if is_categorical is None:
            is_categorical = np.zeros(shape=X_binned.shape[1], dtype=np.uint8)
        else:
            is_categorical = np.asarray(is_categorical, dtype=np.uint8)

        if np.any(
                np.logical_and(is_categorical == 1,
                               monotonic_cst != MonotonicConstraint.NO_CST)):
            raise ValueError(
                "Categorical features cannot have monotonic constraints.")

        hessians_are_constant = hessians.shape[0] == 1
        self.histogram_builder = HistogramBuilder(X_binned, n_bins, gradients,
                                                  hessians,
                                                  hessians_are_constant,
                                                  n_threads)
        missing_values_bin_idx = n_bins - 1
        self.splitter = Splitter(
            X_binned,
            n_bins_non_missing,
            missing_values_bin_idx,
            has_missing_values,
            is_categorical,
            monotonic_cst,
            l2_regularization,
            min_hessian_to_split,
            min_samples_leaf,
            min_gain_to_split,
            hessians_are_constant,
            n_threads,
        )
        self.n_bins_non_missing = n_bins_non_missing
        self.missing_values_bin_idx = missing_values_bin_idx
        self.max_leaf_nodes = max_leaf_nodes
        self.has_missing_values = has_missing_values
        self.monotonic_cst = monotonic_cst
        self.is_categorical = is_categorical
        self.l2_regularization = l2_regularization
        self.n_features = X_binned.shape[1]
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.X_binned = X_binned
        self.min_gain_to_split = min_gain_to_split
        self.shrinkage = shrinkage
        self.n_threads = n_threads
        self.splittable_nodes = []
        self.finalized_leaves = []
        self.total_find_split_time = 0.0  # time spent finding the best splits
        self.total_compute_hist_time = 0.0  # time spent computing histograms
        self.total_apply_split_time = 0.0  # time spent splitting nodes
        self.n_categorical_splits = 0
        self._intilialize_root(gradients, hessians, hessians_are_constant)
        self.n_nodes = 1
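
A minimal usage sketch for the private grower API above, assuming the constructor signature shown in this snippet; the shapes and dtypes are inferred from the code and may differ across scikit-learn versions:

import numpy as np

from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower

rng = np.random.RandomState(0)
X = rng.rand(200, 3)
X_binned = _BinMapper(n_bins=256).fit_transform(X)  # uint8 binned features

gradients = rng.normal(size=200).astype(G_H_DTYPE)
hessians = np.ones(1, dtype=G_H_DTYPE)  # shape (1,) signals constant hessians

grower = TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=31)
grower.grow()  # grow the tree until no splittable nodes remain
print(grower.n_nodes)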
Example #11
def learn_clusters(n_clust):
    client = Client(n_workers=4, processes=True)

    # 1. Learn clusters

    # Full set
    kmeans_path = 'Clustering/KMeans/n{}posts.joblib'.format(n_clust)

    array = da.from_npy_stack(npy_stack_path)
    kmeans = KMeans(n_clusters=n_clust)

    # Learn on a part of set
    # array = np.load('Clustering/npy_post_vecs_part/0.npy')
    # kmeans = SKMeans(n_clusters=n_clust)

    print('Fitting')
    kmeans.fit(array)

    del array
    # Dump centroids to the disk

    # Dump as a sklearn object, for (maybe) faster prediction and less problems
    skmeans = SKMeans(n_clusters=n_clust)
    skmeans.cluster_centers_ = kmeans.cluster_centers_
    skmeans._n_threads = _openmp_effective_n_threads()
    dump(skmeans, kmeans_path)
    del kmeans, skmeans

    # dump(kmeans, kmeans_path) # For learning on a part of set
    # del kmeans
    print('Fitted')

    # 3. Turn posts into clusters
    kmeans_path = 'Clustering/KMeans/n{}posts.joblib'.format(n_clust)

    df = dd.read_parquet('preprocessed.parquet')
    df = df.map_partitions(df_to_vector_predict,
                           kmeans_path,
                           meta={
                               'user_id': int,
                               'post_id': int,
                               'text': object,
                               'type': str,
                               'date': str,
                               'cluster': int
                           })
    df.to_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))
    print('Clustered')

    # 2.5. Filter outdated posts out. (Next time, write the parsing date to user_info.)
    # Find the most recent like in the set and keep only likes newer than that date minus break_time.
    df = dd.read_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))
    print('Original df len: {}'.format(len(df)))

    year = 31536000  # one year in seconds (Unix timestamp)
    kyear = 20
    break_time = kyear * year  # look-back window: kyear years before the most recent like
    # The data set was fully collected on 8 June 2020.
    last_like = df['date'].max().compute()

    # Keep only likes newer than (last_like - break_time)
    df = df[df['date'] > last_like - break_time]
    print('max_date: {} '.format(df['date'].max().compute()))
    print('min date: {}'.format(df['date'].min().compute()))
    print('Filtered df len: {}'.format(len(df)))
    print('Likes have been filtered out by date')

    # 3. Group clusters by user_id and turn them into a single vector for each user

    # df = dd.read_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust)) # INSTEAD OF FILTER!

    # - Count text_likes number for each user (and later merge with user_info)
    count = df.drop(columns=['post_id', 'type', 'date', 'cluster']).groupby(
        'user_id')['text'].count().compute()
    count.rename('text_likes', inplace=True)

    # Generate meta
    meta = {'user_id': int}
    for i in range(n_clust):
        meta[i] = float

    df = df.map_partitions(
        lambda df_part: kt.clusters_to_vector(df_part, n_clust), meta=meta)

    df.to_parquet(
        'Clustering/KMeans/n{}posts-cluster_vecs.parquet'.format(n_clust))

    # 5. Merge clusters and user_info dataframes. (Working with pandas frames)
    df_info = pd.read_csv('users_info.csv')

    df_info = df_info.merge(count, on='user_id', how='inner')
    del count

    df = pd.read_parquet(
        'Clustering/KMeans/n{}posts-cluster_vecs.parquet'.format(n_clust))

    df = df_info.merge(
        df, on='user_id', how='inner'
    )  # Merge user info with the cluster vectors (a possible source of errors)

    df.to_csv('Clustering/KMeans/n{}-final_dataset-{}year.csv'.format(
        n_clust, kyear))
    print('Final dataset has been saved')
    del df_info

    # Filter some users out
    # df = pd.read_csv('Clustering/KMeans/n{}-final_dataset.csv'.format(n_clust)).drop(columns=['Unnamed: 0']) # TESTING

    df = df.loc[(df['text_likes'] > 100) & (df['text_likes'] < 1000)]

    df['bdate'] = df['bdate'].apply(
        lambda bd: time.mktime(datetime.strptime(bd, "%d.%m.%Y").timetuple()))

    # Clean up the dataset
    df = df.drop(columns=[
        'posts_n', 'text_likes', 'status', 'sex', 'smoking', 'alcohol',
        'parth_id', 'country', 'city', 'user_id'
    ]).dropna().reset_index(drop=True)

    # 6. Train a linear regression model
    regr = LinearRegression()
    R2 = train(df, regr)

    client.close()
    return R2