Python Client.has_what примеры использования

Язык программирования: Python

Пространство имен/Пакет: dask.distributed

Класс/Тип: Client

Метод/Функция: has_what

Примеров на hotexamples.com: 13

Python Client.has_what - 13 примеров найдено. Это лучшие примеры Python кода для dask.distributed.Client.has_what, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Client(30)

gather(30)

run(30)

map(30)

scatter(30)

submit(30)

shutdown(30)

close(30)

compute(30)

persist(27)

scheduler_info(24)

restart(23)

upload_file(21)

run_on_scheduler(17)

wait_for_workers(17)

ncores(16)

has_what(13)

get(10)

get_versions(9)

register_worker_callbacks(8)

register_worker_plugin(8)

cancel(7)

nthreads(6)

processing(5)

sync(5)

current(4)

who_has(4)

publish_dataset(3)

get_task_stream(3)

recreate_error_locally(2)

get_dataset(2)

write_scheduler_file(2)

rebalance(2)

retire_workers(2)

_get_scheduler_info(1)

get_registered_workers(1)

as_current(1)

futures_of(1)

get_collection(1)

tempdir_object(1)

start_ipython_workers(1)

start_workers(1)

profile(1)

replicate(1)

set_collection(1)

scheduler_status(1)

get_worker_logs(1)

list_datasets(1)

set_metadata(1)

Пример #1

Показать файл

Файл: cuml_test.py Проект: shavkunov/MLSE

def create_cuml_distributed(X_train, y_train):
    """Fit a distributed cuML random forest on a local CUDA Dask cluster.

    A ``LocalCUDACluster`` with one thread per worker is started, the
    training data is copied into cuDF objects, split into one partition per
    worker, persisted across the workers and used to fit the forest.

    Parameters
    ----------
    X_train : array-like
        Training features; wrapped in a ``pandas.DataFrame`` before being
        converted to a cuDF DataFrame.
    y_train : array-like
        Training labels; converted to a ``cudf.Series``.

    Returns
    -------
    model
        The fitted ``distributed_cuml_Rf`` estimator. The cluster and client
        used for training are closed before returning.
    """
    start_time = datetime.now()
    print('init dask cluster')

    cluster = LocalCUDACluster(threads_per_worker=1)
    try:
        client = Client(cluster)
        try:
            # has_what() is keyed by worker address; take a concrete list so
            # the same worker set is used for partitioning and persisting.
            workers = list(client.has_what())
            n_workers = len(workers)

            X_train_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
            y_train_cudf = cudf.Series(y_train)

            X_train_dask = dask_cudf.from_cudf(X_train_cudf,
                                               npartitions=n_workers)
            y_train_dask = dask_cudf.from_cudf(y_train_cudf,
                                               npartitions=n_workers)

            X_train_ddask, y_train_ddask = dask_utils.persist_across_workers(
                client, [X_train_dask, y_train_dask], workers=workers)
            print('cuml distributed initialized', datetime.now() - start_time)

            model = distributed_cuml_Rf(n_estimators=500, n_streams=64)
            # Fit on the persisted, partitioned frames; the original fitted
            # on the host inputs and left the distributed copies unused.
            model.fit(X_train_ddask, y_train_ddask)

            wait(model.rfs)
            print('cuml distributed finished', datetime.now() - start_time)
        finally:
            # Release the client even when conversion or fitting fails.
            client.close()
    finally:
        cluster.close()
    return model

Пример #2

Показать файл

Файл: test_random_forest.py Проект: elisaoh/ECE759

def test_rf_regression_dask_fil(partitions_per_worker, cluster):
    """Multi-GPU random-forest regression must reach an R^2 of at least 0.67.

    Builds a synthetic regression problem, spreads the training data over
    ``partitions_per_worker`` partitions per connected worker, fits the
    distributed regressor and scores its predictions on a 1000-row hold-out.

    Parameters
    ----------
    partitions_per_worker : int
        Number of data partitions to create for each worker.
    cluster
        Cluster object the Dask ``Client`` connects to.
    """
    # Use CUDA_VISIBLE_DEVICES to control the number of workers
    c = Client(cluster)

    try:

        X, y = make_regression(n_samples=10000,
                               n_features=20,
                               n_informative=10,
                               random_state=123)

        X = X.astype(np.float32)
        y = y.astype(np.float32)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=1000)

        cu_rf_params = {
            'n_estimators': 50,
            'max_depth': 16,
            'n_bins': 16,
        }

        # has_what() is keyed by worker address; snapshot it as a list so
        # the partition count and the persist call see the same workers.
        workers = list(c.has_what())
        n_partitions = partitions_per_worker * len(workers)

        X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
        X_train_df = \
            dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

        # Flatten the single-column label frame into a 1-D series.
        y_cudf = np.array(pd.DataFrame(y_train).values)
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)
        y_train_df = \
            dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)
        X_cudf_test = cudf.DataFrame.from_pandas(pd.DataFrame(X_test))
        X_test_df = \
            dask_cudf.from_cudf(X_cudf_test, npartitions=n_partitions)

        X_train_df, y_train_df = dask_utils.persist_across_workers(
            c, [X_train_df, y_train_df], workers=workers)

        cu_rf_mg = cuRFR_mg(**cu_rf_params)
        cu_rf_mg.fit(X_train_df, y_train_df)

        cu_rf_mg_predict = cu_rf_mg.predict(X_test_df).compute()
        cu_rf_mg_predict = cp.asnumpy(cp.array(cu_rf_mg_predict))

        # r2_score expects the ground truth first; R^2 is not symmetric, so
        # the previously swapped arguments scored against the wrong variance.
        acc_score = r2_score(y_test, cu_rf_mg_predict)

        assert acc_score >= 0.67

    finally:
        c.close()

Пример #3

Показать файл

def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    cluster):
    """Fit and predict with distributed KMeans on well-separated blobs.

    Checks the number of output partitions, the shape and label range of the
    predictions, and that the clustering matches the generated labels
    exactly (adjusted Rand score of 1.0).

    Parameters
    ----------
    nrows, ncols, nclusters : int
        Size of the generated data set and number of blob centres.
    n_parts : int or None
        Number of partitions requested from ``make_blobs``.
    delayed_predict
        Forwarded as the second positional argument of ``predict``.
    cluster
        Cluster object the Dask ``Client`` connects to.
    """
    client = None

    try:

        client = Client(cluster)
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows,
                               ncols,
                               nclusters,
                               n_parts,
                               cluster_std=0.01,
                               verbose=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0,
                               init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)
        cumlLabels = cumlModel.predict(X_cudf, delayed_predict)

        # has_what() has one entry per worker.
        n_workers = len(client.has_what())

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            assert cumlLabels.npartitions == n_parts
        else:
            assert cumlLabels.npartitions == n_workers

        cumlPred = cp.array(cumlLabels.compute())

        assert cumlPred.shape[0] == nrows
        assert np.max(cumlPred) == nclusters - 1
        assert np.min(cumlPred) == 0

        labels = np.squeeze(y.compute().to_pandas().values)

        score = adjusted_rand_score(labels, cp.squeeze(cumlPred.get()))

        print(str(score))

        assert 1.0 == score

    finally:
        # Client() itself may have failed; calling close() on None would
        # replace the real error with an AttributeError.
        if client is not None:
            client.close()

Пример #4

Показать файл

Файл: test_kmeans.py Проект: rnyak/cuml

def test_end_to_end(nrows, ncols, nclusters, n_parts, cluster):
    """End-to-end check of distributed KMeans on tightly clustered blobs.

    The predictions must be partitioned as expected, cover every row, use
    labels ``0 .. nclusters - 1`` and agree perfectly with the generated
    labels (adjusted Rand score of exactly 1.0).
    """
    client = Client(cluster)

    try:
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(
            nrows, ncols, nclusters, n_parts,
            cluster_std=0.01, verbose=True, random_state=10,
        )

        wait(X_cudf)

        kmeans = cumlKMeans(
            verbose=1, init="k-means||",
            n_clusters=nclusters, random_state=10,
        )
        kmeans.fit(X_cudf)
        predicted_labels = kmeans.predict(X_cudf)

        # One has_what() entry per worker.
        n_workers = len(client.has_what())

        # Verifying we are grouping partitions. This should be changed soon.
        grouped = n_parts is not None and n_parts < n_workers
        expected_partitions = n_parts if grouped else n_workers
        assert predicted_labels.npartitions == expected_partitions

        from sklearn.metrics import adjusted_rand_score

        predicted = predicted_labels.compute().to_pandas().values

        assert predicted.shape[0] == nrows
        assert np.max(predicted) == nclusters - 1
        assert np.min(predicted) == 0

        expected = y.compute().to_pandas().values
        flat_expected = expected.reshape(expected.shape[0])

        score = adjusted_rand_score(flat_expected, predicted)

        assert score == 1.0

    finally:
        client.close()

Пример #5

Показать файл

def test_rf_regression(n_workers, partitions_per_worker):
    """Multi-GPU random-forest regression must reach an R^2 of at least 0.70.

    Skipped when fewer than ``n_workers`` GPUs are available. A dedicated
    ``LocalCUDACluster`` is started for the test and is always shut down,
    including when fitting or an assertion fails.

    Parameters
    ----------
    n_workers : int
        Number of workers to start in the local cluster.
    partitions_per_worker : int
        Number of data partitions to create for each worker.
    """
    if dask_cuda.utils.get_n_gpus() < n_workers:
        pytest.skip("too few GPUs")

    cluster = LocalCUDACluster(threads_per_worker=1, n_workers=n_workers)
    try:
        c = Client(cluster)
        try:
            X, y = make_regression(n_samples=40000,
                                   n_features=20,
                                   n_informative=10,
                                   random_state=123)

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=1000)

            cu_rf_params = {
                'n_estimators': 25,
                'max_depth': 13,
            }

            # has_what() is keyed by worker address; snapshot it as a list.
            workers = list(c.has_what())
            n_partitions = partitions_per_worker * len(workers)

            X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
            X_train_df = \
                dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

            # Flatten the single-column label frame into a 1-D series.
            y_cudf = np.array(pd.DataFrame(y_train).values)
            y_cudf = y_cudf[:, 0]
            y_cudf = cudf.Series(y_cudf)
            y_train_df = \
                dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

            X_train_df, y_train_df = dask_utils.persist_across_workers(
                c, [X_train_df, y_train_df], workers=workers)

            cu_rf_mg = cuRFR_mg(**cu_rf_params)
            cu_rf_mg.fit(X_train_df, y_train_df)
            cu_rf_mg_predict = cu_rf_mg.predict(X_test)

            # r2_score expects the ground truth first; R^2 is not symmetric.
            acc_score = r2_score(y_test, cu_rf_mg_predict)

            print(str(acc_score))

            assert acc_score >= 0.70
        finally:
            # Previously skipped whenever the test body raised.
            c.close()
    finally:
        cluster.close()

Пример #6

Показать файл

Файл: test_linear_regression.py Проект: trxcllnt/cuml

def test_ols(cluster):
    """Distributed linear regression must fit its own training data.

    Loads a small data set, distributes it with one partition per worker,
    fits the Dask linear regression and requires a training mean squared
    error below 1e-6.

    Parameters
    ----------
    cluster
        Cluster object the Dask ``Client`` connects to. It is closed,
        together with the client, when the test finishes.
    """
    client = Client(cluster)

    try:

        import dask_cudf

        import cudf
        import numpy as np

        from cuml.dask.linear_model import LinearRegression as cumlOLS_dask

        nrows = 2**8
        ncols = 399

        X, y = load_data(nrows, ncols)

        X_cudf = cudf.DataFrame.from_pandas(X)
        # DataFrame.as_matrix() was removed from pandas; .values returns the
        # same 2-D array on old and new versions alike.
        y_cudf = np.array(y.values)
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)

        # has_what() is keyed by worker address; only the count is needed.
        workers = list(client.has_what())

        X_df = dask_cudf.from_cudf(X_cudf, npartitions=len(workers)).persist()
        y_df = dask_cudf.from_cudf(y_cudf, npartitions=len(workers)).persist()

        lr = cumlOLS_dask()

        lr.fit(X_df, y_df)

        ret = lr.predict(X_df)

        error_cuml = mean_squared_error(y, ret.compute().to_array())

        assert error_cuml < 1e-6

    finally:
        client.close()
        cluster.close()

Пример #7

Показать файл

Файл: rapids_csp_azure.py Проект: vaidya-s/azureml-examples

class RapidsCloudML(object):
    def __init__(
        self,
        cloud_type="Azure",
        model_type="RandomForest",
        data_type="Parquet",
        compute_type="single-GPU",
        verbose_estimator=False,
        CSP_paths=default_azureml_paths,
    ):
        """
        Store the experiment configuration and, for multi-GPU runs, start a
        local CUDA Dask cluster.

        Parameters
        ----------
        cloud_type : string
                     Name of the cloud provider; stored and logged.
        model_type : string
                     Model to train, e.g. 'XGBoost' or 'RandomForest'.
        data_type : string
                    Input format, e.g. 'ORC', 'CSV' or 'Parquet'.
        compute_type : string
                       Matched by substring in the other methods
                       ('single'/'multi' and 'CPU'/'GPU').
        verbose_estimator : bool
                            Forwarded as ``verbose`` to the RandomForest estimators.
        CSP_paths : dict
                    Paths used by the object (the 'hyperparams' and 'output'
                    keys are read by other methods).
        """

        self.CSP_paths = CSP_paths
        self.cloud_type = cloud_type
        self.model_type = model_type
        self.data_type = data_type
        self.compute_type = compute_type
        self.verbose_estimator = verbose_estimator

        # Always define the cluster attributes: other methods read
        # self.n_workers / self.workers / self.client and previously hit an
        # AttributeError when no Dask cluster had been started.
        self.client = None
        self.workers = []
        self.n_workers = 1

        self.log_to_file(
            f"\n> RapidsCloudML\n\tCompute, Data , Model, Cloud types {self.compute_type, self.data_type, self.model_type, self.cloud_type}"
        )

        # Setting up client for multi-GPU option
        if "multi" in self.compute_type:
            self.log_to_file("\n\tMulti-GPU selected")
            # This will use all GPUs on the local host by default
            cluster = LocalCUDACluster(threads_per_worker=1)
            self.client = Client(cluster)

            # Query the client for all connected workers
            self.workers = self.client.has_what().keys()
            self.n_workers = len(self.workers)
            self.log_to_file(f"\n\tClient information {self.client}")

    def load_hyperparams(self, model_name="XGBoost"):
        """
        Selecting model parameters based on the model we select for execution.
        Checks if there is a config file present in the path self.CSP_paths['hyperparams'] with
        the parameters for the experiment. If not present, it returns the default parameters.

        Parameters
        ----------
        model_name : string
                     Currently unused: the defaults are chosen from self.model_type
                     ('XGBoost' or 'RandomForest'). Kept for backward compatibility.

        Returns
        ----------
        model_params : dict
                       The default parameters for the model type, overridden by
                       any values found in the config file. When the config file
                       cannot be read the defaults are returned unchanged.
        """

        self.log_to_file("\n> Loading Hyperparameters")

        # Default parameters of the models
        if self.model_type == "XGBoost":
            # https://xgboost.readthedocs.io/en/latest/parameter.html
            model_params = {
                "max_depth": 6,
                "num_boost_round": 100,
                "learning_rate": 0.3,
                "gamma": 0.0,
                "lambda": 1.0,
                "alpha": 0.0,
                "objective": "binary:logistic",
                "random_state": 0,
            }

        elif self.model_type == "RandomForest":
            # https://docs.rapids.ai/api/cuml/stable/  -> cuml.ensemble.RandomForestClassifier
            model_params = {
                "n_estimators": 10,
                "max_depth": 10,
                "n_bins": 16,
                "max_features": 1.0,
                "seed": 0,
            }

        else:
            # No built-in defaults for this model type; rely on the config file.
            model_params = {}

        try:
            with open(self.CSP_paths["hyperparams"], "r") as file_handle:
                hyperparameters = json.load(file_handle)
            for key, value in hyperparameters.items():
                model_params[key] = value
        except Exception as error:
            # Best effort: a missing or unreadable config is logged and the
            # defaults are used, as the docstring promises (this path used
            # to return None).
            self.log_to_file(str(error))
            return model_params

        pprint.pprint(model_params)
        return model_params

    def load_data(self,
                  filename="dataset.orc",
                  col_labels=None,
                  y_label="ArrDelayBinary"):
        """
        Loading the data into the object from the filename and based on the columns that we are
        interested in. Also, generates y_label from 'ArrDelay' column to convert this into a binary
        classification problem.

        Parameters
        ----------
        filename : string
                   the path of the dataset to be loaded

        col_labels : list of strings
                     The input columns that we are interested in. None selects all the columns

        y_label : string
                  The column to perform the prediction task in.

        Returns
        ----------
        dataset : dataframe (Pandas, cudf or dask-cudf)
                  Ingested dataset in the format of a dataframe

        col_labels : list of strings
                     The input columns selected

        y_label : string
                  The generated y_label name for binary classification

        duration : float
                   The time it took to execute the function

        Raises
        ----------
        ValueError
            If no reader matches the configured compute_type / data_type.
        """
        target_filename = filename
        self.log_to_file(f"\n> Loading dataset from {target_filename}")

        dataset = None
        with PerfTimer() as ingestion_timer:
            if "CPU" in self.compute_type:
                # CPU Reading options
                self.log_to_file("\n\tCPU read")

                if self.data_type == "ORC":
                    with open(target_filename, mode="rb") as file:
                        dataset = pyarrow_orc.ORCFile(file).read().to_pandas()
                elif self.data_type == "CSV":
                    dataset = pd.read_csv(target_filename, names=col_labels)

                elif self.data_type == "Parquet":

                    if "single" in self.compute_type:
                        dataset = pd.read_parquet(target_filename)

                    elif "multi" in self.compute_type:
                        self.log_to_file("\n\tReading using dask dataframe")
                        # The original passed an undefined name `columns`
                        # here; the requested columns live in col_labels.
                        dataset = dask.dataframe.read_parquet(
                            target_filename, columns=col_labels)

            elif "GPU" in self.compute_type:
                # GPU Reading Option

                self.log_to_file("\n\tGPU read")
                if self.data_type == "ORC":
                    dataset = cudf.read_orc(target_filename)

                elif self.data_type == "CSV":
                    dataset = cudf.read_csv(target_filename, names=col_labels)

                elif self.data_type == "Parquet":

                    if "single" in self.compute_type:
                        dataset = cudf.read_parquet(target_filename)

                    elif "multi" in self.compute_type:
                        self.log_to_file("\n\tReading using dask_cudf")
                        dataset = dask_cudf.read_parquet(target_filename,
                                                         columns=col_labels)

        if dataset is None:
            # Fail with an explicit message instead of an unbound-variable
            # error a few lines further down.
            raise ValueError(
                "Unsupported compute_type/data_type combination: "
                f"{self.compute_type!r} / {self.data_type!r}")

        # cast all columns to float32
        for col in dataset.columns:
            dataset[col] = dataset[col].astype(
                np.float32)  # needed for random forest

        # Adding y_label column if it is not present
        if y_label not in dataset.columns:
            dataset[y_label] = 1.0 * (dataset["ArrDelay"] > 10)

        dataset[y_label] = dataset[y_label].astype(
            np.int32)  # Needed for cuml RF

        dataset = dataset.fillna(
            0.0)  # Filling the null values. Needed for dask-cudf

        self.log_to_file(
            f"\n\tIngestion completed in {ingestion_timer.duration}")
        self.log_to_file(
            f"\n\tDataset descriptors: {dataset.shape}\n\t{dataset.dtypes}")
        return dataset, col_labels, y_label, ingestion_timer.duration

    def split_data(self,
                   dataset,
                   y_label,
                   train_size=0.8,
                   random_state=0,
                   shuffle=True):
        """
        Splitting data into train and test split, using the splitter that matches the compute mode.
        CPU compute - Uses sklearn, we manually filter y_label column in the split call
        GPU Compute - Single GPU uses cuml and multi GPU uses dask, both split y_label internally.

        Parameters
        ----------
        dataset : dataframe
                  The dataframe on which we wish to perform the split
        y_label : string
                  The name of the column (not the series itself)
        train_size : float
                     The size for the split. Takes values between 0 to 1.
        random_state : int
                       Useful for running reproducible splits.
        shuffle : bool
                  Specifies if the data must be shuffled before splitting.
                  Ignored on the multi-GPU path, which always passes shuffle=False.

        Returns
        ----------
        X_train : dataframe
                  The data to be used for training. Has same type as input dataset.
        X_test : dataframe
                  The data to be used for testing. Has same type as input dataset.
        y_train : dataframe
                  The label to be used for training. Has same type as input dataset.
        y_test : dataframe
                  The label to be used for testing. Has same type as input dataset.
        duration : float
                   The time it took to perform the split
        """
        self.log_to_file("\n> Splitting train and test data")

        # Timing comes from PerfTimer alone; the extra perf_counter() reading
        # the original took here was never used.
        with PerfTimer() as split_timer:
            if "CPU" in self.compute_type:
                X_train, X_test, y_train, y_test = sklearn_train_test_split(
                    dataset.loc[:, dataset.columns != y_label],
                    dataset[y_label],
                    train_size=train_size,
                    shuffle=shuffle,
                    random_state=random_state,
                )

            elif "GPU" in self.compute_type:
                if "single" in self.compute_type:
                    X_train, X_test, y_train, y_test = cuml_train_test_split(
                        X=dataset,
                        y=y_label,
                        train_size=train_size,
                        shuffle=shuffle,
                        random_state=random_state,
                    )
                elif "multi" in self.compute_type:
                    X_train, X_test, y_train, y_test = dask_train_test_split(
                        dataset,
                        y_label,
                        train_size=train_size,
                        shuffle=False,  # shuffle not available for dask_cudf yet
                        random_state=random_state,
                    )

        self.log_to_file(
            f"\n\tX_train shape and type{X_train.shape} {type(X_train)}")
        self.log_to_file(f"\n\tSplit completed in {split_timer.duration}")
        return X_train, X_test, y_train, y_test, split_timer.duration

    def train_model(self, X_train, y_train, model_params):
        """
        Trains a model with the model_params specified by calling fit_xgboost or
        fit_random_forest depending on the model_type.

        Parameters
        ----------
        X_train : dataframe
                  The data for training
        y_train : dataframe
                  The label to be used for training.
        model_params : dict
                       The model params to use for this training
        Returns
        ----------
        trained_model : The object of the trained model either of XGBoost or RandomForest,
                        or None when training failed (the error is logged).

        training_time : float
                        The time it took to train the model (0 when training failed)
        """
        self.log_to_file(
            f"\n> Training {self.model_type} estimator w/ hyper-params")
        # Bound up front so a logged training failure returns (None, 0)
        # instead of raising UnboundLocalError at the return statement.
        trained_model = None
        training_time = 0

        try:
            if self.model_type == "XGBoost":
                trained_model, training_time = self.fit_xgboost(
                    X_train, y_train, model_params)
            elif self.model_type == "RandomForest":
                trained_model, training_time = self.fit_random_forest(
                    X_train, y_train, model_params)
            else:
                raise ValueError(f"Unsupported model_type: {self.model_type!r}")
        except Exception as error:
            self.log_to_file("\n\n!error during model training: " + str(error))
        self.log_to_file(f"\n\tFinished training in {training_time:.4f} s")
        return trained_model, training_time

    def fit_xgboost(self, X_train, y_train, model_params):
        """
        Trains a XGBoost model on X_train and y_train with model_params

        Parameters
        ----------
        X_train : dataframe
                  The data for training
        y_train : dataframe
                  The label to be used for training.
        model_params : dict
                       Booster parameters; must contain 'num_boost_round'.
                       The dict is not modified.

        Returns
        ----------
        trained_model : the value returned by xgboost.train (single) or
                        xgboost.dask.train (multi)
        duration : float
                   The time it took to train the model

        Raises
        ----------
        KeyError
            If model_params has no 'num_boost_round' entry.
        ValueError
            If compute_type contains neither 'single' nor 'multi'.
        """
        # Work on a copy so the caller's dict is left untouched.
        params = dict(model_params)
        # num_boost_round is an argument of train(), not a booster parameter.
        num_boost_round = params.pop("num_boost_round")
        params["tree_method"] = (
            "gpu_hist" if "GPU" in self.compute_type else "hist")

        with PerfTimer() as train_timer:
            if "single" in self.compute_type:
                train_DMatrix = xgboost.DMatrix(data=X_train, label=y_train)
                trained_model = xgboost.train(
                    dtrain=train_DMatrix,
                    params=params,
                    num_boost_round=num_boost_round,
                )
            elif "multi" in self.compute_type:
                self.log_to_file("\n\tTraining multi-GPU XGBoost")
                train_DMatrix = xgboost.dask.DaskDMatrix(self.client,
                                                         data=X_train,
                                                         label=y_train)
                trained_model = xgboost.dask.train(
                    self.client,
                    dtrain=train_DMatrix,
                    params=params,
                    num_boost_round=num_boost_round,
                )
            else:
                # Previously fell through to an unbound trained_model.
                raise ValueError(
                    f"Unsupported compute_type: {self.compute_type!r}")
        return trained_model, train_timer.duration

    def fit_random_forest(self, X_train, y_train, model_params):
        """
        Trains a RandomForest model on X_train and y_train with model_params.
        Depending on compute_type, estimators from appropriate packages are used.
        CPU - sklearn
        Single-GPU - cuml
        multi_gpu - cuml.dask

        Parameters
        ----------
        X_train : dataframe
                  The data for training
        y_train : dataframe
                  The label to be used for training.
        model_params : dict
                       Reads 'n_estimators', 'max_depth', 'max_features' and,
                       on the GPU paths, 'n_bins'.

        Returns
        ----------
        trained_model : the value returned by the estimator's fit(), or None
                        when fitting failed (the error is logged)
        duration : float
                   The time it took to train the model
        """
        if "CPU" in self.compute_type:
            rf_model = sklearn.ensemble.RandomForestClassifier(
                n_estimators=model_params["n_estimators"],
                max_depth=model_params["max_depth"],
                max_features=model_params["max_features"],
                # n_workers may never have been set when no Dask cluster was
                # started; fall back to a single job in that case.
                n_jobs=int(getattr(self, "n_workers", 1)),
                verbose=self.verbose_estimator,
            )
        elif "GPU" in self.compute_type:
            if "single" in self.compute_type:
                rf_model = cuml.ensemble.RandomForestClassifier(
                    n_estimators=model_params["n_estimators"],
                    max_depth=model_params["max_depth"],
                    n_bins=model_params["n_bins"],
                    max_features=model_params["max_features"],
                    verbose=self.verbose_estimator,
                )
            elif "multi" in self.compute_type:
                self.log_to_file("\n\tFitting multi-GPU daskRF")
                X_train, y_train = dask_utils.persist_across_workers(
                    self.client,
                    [X_train.fillna(0.0),
                     y_train.fillna(0.0)],
                    workers=self.workers,
                )
                rf_model = cuml.dask.ensemble.RandomForestClassifier(
                    n_estimators=model_params["n_estimators"],
                    max_depth=model_params["max_depth"],
                    n_bins=model_params["n_bins"],
                    max_features=model_params["max_features"],
                    verbose=self.verbose_estimator,
                )

        # Bound up front so a logged fit failure returns None instead of
        # raising UnboundLocalError at the return statement.
        trained_model = None
        with PerfTimer() as train_timer:
            try:
                trained_model = rf_model.fit(X_train, y_train)
            except Exception as error:
                self.log_to_file("\n\n! Error during fit " + str(error))
        return trained_model, train_timer.duration

    def evaluate_test_perf(self, trained_model, X_test, y_test, threshold=0.5):
        """
        Scores a trained model on the test split and times the evaluation.

        XGBoost predictions are obtained through a (Dask)DMatrix, binarised
        with ``threshold`` and compared with the labels via accuracy_score.
        RandomForest uses the estimator's own .score in the single case and
        predict + accuracy_score in the multi case. Any error raised while
        evaluating is logged and the accuracy is reported as None.

        Parameters
        ----------
        trained_model : The object of the trained model either of XGBoost or RandomForest
        X_test : dataframe
                  The data for testing
        y_test : dataframe
                  The label to be used for testing.
        threshold : float
                    XGBoost outputs strictly above this value count as class 1.
        Returns
        ----------
        test_accuracy : float
                        The accuracy achieved on test set (None on failure)
        duration : float
                   The time it took to evaluate the model
        """
        self.log_to_file("\n> Inferencing on test set")
        test_accuracy = None
        with PerfTimer() as inference_timer:
            try:
                if self.model_type == "XGBoost":
                    if "multi" in self.compute_type:
                        dtest = xgboost.dask.DaskDMatrix(
                            self.client, data=X_test, label=y_test)
                        raw_scores = xgboost.dask.predict(
                            self.client, trained_model, dtest).compute()
                        predicted = (raw_scores > threshold) * 1.0
                        test_accuracy = accuracy_score(
                            y_test.compute(), predicted)
                    elif "single" in self.compute_type:
                        dtest = xgboost.DMatrix(data=X_test, label=y_test)
                        raw_scores = trained_model.predict(dtest)
                        predicted = (raw_scores > threshold) * 1.0
                        test_accuracy = accuracy_score(y_test, predicted)

                elif self.model_type == "RandomForest":
                    if "multi" in self.compute_type:
                        predicted = trained_model.predict(X_test).compute()
                        self.log_to_file("\n\tPrediction complete")
                        test_accuracy = accuracy_score(
                            y_test.compute(), predicted, convert_dtype=True)
                    elif "single" in self.compute_type:
                        int_labels = y_test.astype("int32")
                        test_accuracy = trained_model.score(X_test, int_labels)

            except Exception as error:
                self.log_to_file("\n\n!error during inference: " + str(error))

        elapsed = inference_timer.duration
        self.log_to_file(f"\n\tFinished inference in {elapsed:.4f} s")
        self.log_to_file(f"\n\tTest-accuracy: {test_accuracy}")
        return test_accuracy, inference_timer.duration

    def set_up_logging(self):
        """
        Configure logging to write INFO-level records to ``log.txt`` under
        the directory given by self.CSP_paths['output'].
        """
        output_dir = self.CSP_paths["output"]
        logging.basicConfig(filename=output_dir + "/log.txt",
                            level=logging.INFO)

    def log_to_file(self, text):
        """
        Emit ``text`` as an INFO log record and also print it.
        """
        message = text
        logging.info(message)
        print(message)

Пример #8

Показать файл

Файл: xgb-dask.py Проект: valeman/GBM-perf

d_train = pd.read_csv("https://raw.githubusercontent.com/szilard/benchm-ml--data/master/int_enc/train-1m-intenc.csv")
d_test = pd.read_csv("https://raw.githubusercontent.com/szilard/benchm-ml--data/master/int_enc/test-1m-intenc.csv")

dx_train = dd.from_pandas(d_train, npartitions=16)
dx_test = dd.from_pandas(d_test, npartitions=1)

X_train = dx_train.iloc[:, :-1].to_dask_array(lengths=True)
y_train = dx_train.iloc[:,-1:].to_dask_array(lengths=True)
X_test = dx_test.iloc[:, :-1].to_dask_array(lengths=True)
y_test = dx_test.iloc[:,-1:].to_dask_array(lengths=True)

X_train.persist()
y_train.persist()

client.has_what()


dxgb_train = xgb.dask.DaskDMatrix(client, X_train, y_train)
dxgb_test = xgb.dask.DaskDMatrix(client, X_test)


param = {'objective':'binary:logistic', 'tree_method':'hist', 'max_depth':10, 'eta':0.1}             
%time md = xgb.dask.train(client, param, dxgb_train, num_boost_round = 100)


y_pred = xgb.dask.predict(client, md, dxgb_test)
y_pred_loc = y_pred.compute()
y_test_loc = y_test.compute()
print(metrics.roc_auc_score(y_test_loc, y_pred_loc))

Пример #9

Показать файл

Файл: sampler.py Проект: stsievert/salmon

    def run(self, client: DaskClient):
        """
        Run the algorithm.

        Parameters
        ----------
        client : DaskClient
            A client to Dask.

        Returns
        -------
        bool
            ``True``, returned after the loop is left because a reset was
            requested.

        Raises
        ------
        Exception
            Any exception raised inside an iteration is logged and the loop
            continues; it is re-raised only when the last five recorded
            failures happened in five consecutive iterations.

        Notes
        -----
        This function runs the adaptive algorithm. The Redis client ``rj``
        is not an argument; it is obtained from ``self.redis_client()``.

        Each iteration submits three tasks (``post_queries``,
        ``process_answers`` and ``get_queries``) and blocks on their results.
        The loop ends when
        ``"reset" in rj.keys() and rj.jsonget("reset", root)`` is truthy.

        """
        rj = self.redis_client()

        answers: List = []
        logger.info(f"Staring {self.ident}")

        # Submit the method named ``fn`` (looked up on ``type(self)``) to the
        # Dask client. ``allow_other_workers`` is only forwarded when the
        # caller also pinned the task with ``workers=...``.
        def submit(fn: str, *args, allow_other_workers=True, **kwargs):
            if "workers" in kwargs:
                kwargs.update({"allow_other_workers": allow_other_workers})
            return client.submit(
                getattr(type(self), fn),
                *args,
                **kwargs,
            )

        update = False
        queries = np.array([])
        scores = np.array([])
        n_model_updates = 0
        # Start with an empty list of performance records for this algorithm.
        rj.jsonset(f"alg-perf-{self.ident}", root, [])
        save_deadline = 0.0  # right away
        # Per-iteration records collected since the last post to Redis.
        data: List[Dict[str, Any]] = []

        # Iteration indices at which an exception was caught.
        error_raised: List[int] = []
        for k in itertools.count():
            try:
                loop_start = time()
                datum = {"iteration": k, "ident": self.ident, "time": time()}

                answers = self.get_answers(rj, clear=True)
                datum["num_answers"] = len(answers)
                self_future = client.scatter(self)

                # Reference point for the three task timers below.
                _start = time()
                if len(queries) and len(scores):
                    queries_f = client.scatter(queries)
                    scores_f = client.scatter(scores)
                else:
                    queries_f = scores_f = []
                # ``update`` is the flag returned by the previous iteration's
                # ``process_answers`` task (False on the first pass).
                if update:
                    datum["cleared_queries"] = True
                    __start = time()
                    self.clear_queries(rj)
                    datum["time_clearing"] = time() - __start
                else:
                    datum["cleared_queries"] = False
                # Event set when the model task finishes; handed to the other
                # two tasks as ``done=`` / ``stop=``.
                done = distributed.Event(name="pa_finished")
                done.clear()

                # NOTE(review): ``workers[1]`` and ``workers[2]`` below raise
                # IndexError when ``client.has_what()`` yields fewer than
                # three entries; that error lands in the ``except`` handler.
                workers = list(client.has_what())
                random.shuffle(workers)
                f_post = submit(
                    "post_queries",
                    self_future,
                    queries_f,
                    scores_f,
                    done=done,
                    workers=workers[0],
                )
                f_model = submit(
                    "process_answers",
                    self_future,
                    answers,
                    workers=workers[1],
                )

                f_search = submit(
                    "get_queries",
                    self_future,
                    stop=done,
                    workers=workers[2],
                )

                time_model = 0.0
                time_post = 0.0
                time_search = 0.0

                # Completion callbacks: each records the elapsed time since
                # ``_start``; the model callback also sets ``done``.
                def _model_done(_):
                    nonlocal time_model
                    nonlocal done
                    done.set()
                    time_model += time() - _start

                def _post_done(_):
                    nonlocal time_post
                    time_post += time() - _start

                def _search_done(_):
                    nonlocal time_search
                    time_search += time() - _start

                f_model.add_done_callback(_model_done)
                f_post.add_done_callback(_post_done)
                f_search.add_done_callback(_search_done)

                # Future.result raises errors automatically
                posted = f_post.result()
                new_self, update = f_model.result()
                queries, scores, search_meta = f_search.result()

                _datum_update = {
                    "n_queries_posted": posted,
                    "n_queries_scored": len(queries),
                    "n_queries_in_db": rj.zcard(f"alg-{self.ident}-queries"),
                    "model_updated": update,
                    "n_model_updates": n_model_updates,
                    "time_posting_queries": time_post,
                    "time_model_update": time_model,
                    "time_search": time_search,
                    "time": time(),
                    **search_meta,
                }
                datum.update(_datum_update)
                if update:
                    # Adopt the state of the instance returned by the model
                    # task.
                    _s = time()
                    self.__dict__.update(new_self.__dict__)
                    datum["time_update"] = time() - _s
                    n_model_updates += 1

                # Save on the first pass and then at most once per 60 seconds.
                if time() > save_deadline + 1e-3:
                    save_deadline = time() + 60
                    _s = time()
                    self.save()
                    datum["time_save"] = time() - _s
                datum["time_loop"] = time() - loop_start

                data.append(datum)
                logger.info(datum)
                # Post aggregated statistics two minutes after the oldest
                # buffered record, and also at iterations 10 and 20.
                posting_deadline = data[0]["time"] + 2 * 60
                if time() >= posting_deadline or k == 10 or k == 20:
                    flush_logger(logger)
                    keys = data[-1].keys()
                    to_post = {}
                    for _k in keys:
                        vals = [d.get(_k, None) for d in data]
                        # Drops every falsy value (missing keys, but also 0
                        # and False) before aggregating.
                        vals = [v for v in vals if v]
                        if not len(vals):
                            continue
                        # Only integer and float series are aggregated; the
                        # result is cast back to the matching builtin type.
                        if isinstance(vals[0], (int, np.integer)):
                            Type = int
                        elif isinstance(vals[0], (float, np.floating)):
                            Type = float
                        else:
                            continue
                        _update = {
                            f"{_k}_median": np.median(vals),
                            f"{_k}_mean": np.mean(vals),
                            f"{_k}_min": np.min(vals),
                            f"{_k}_max": np.max(vals),
                        }
                        # For the timestamp only the median is kept, under
                        # the plain "time" key.
                        if _k == "time":
                            _update = {"time": _update["time_median"]}
                        to_post.update(
                            {_k: Type(v)
                             for _k, v in _update.items()})

                    try:
                        rj.jsonarrappend(f"alg-perf-{self.ident}", root,
                                         to_post)
                    except ResponseError as e:
                        if ("could not perform this operation on a key that doesn't exist"
                                in str(e)):
                            # I think this happens when the frontend deletes
                            # the database when /reset is triggered
                            pass
                        else:
                            raise e

                    # Start a fresh buffer after posting.
                    data = []

                if "reset" in rj.keys() and rj.jsonget("reset", root):
                    logger.warning(f"Resetting {self.ident}")
                    self.reset(client, rj, futures=[f_model, f_post, f_search])
                    break

            except Exception as e:
                logger.exception(e)
                flush_logger(logger)
                error_raised.append(k)

                # Give up only when the last ``__n`` recorded failures are in
                # consecutive iterations (differences between them are all 1).
                __n = 5
                if np.diff(error_raised[-__n:]).tolist() == [1] * (__n - 1):
                    logger.exception(e)
                    flush_logger(logger)
                    raise e
        return True

Пример #10

Показать файл

    X_dask, y_dask = \
      dask_utils.persist_across_workers(c, [X_dask, y_dask], workers=workers)
    
    return X_dask, y_dask


if __name__ == "__main__":
    # --- Cluster setup (Dask) ---

    # This will use all GPUs on the local host by default
    # NOTE(review): the original comment here also said "set this to use on
    # node disk for caching", but no such argument is passed below - confirm
    # whether a cache-directory option was meant to be supplied.
    cluster = LocalCUDACluster(threads_per_worker=1)
    c = Client(cluster)

    # Query the client for all connected workers
    workers = c.has_what().keys()
    n_workers = len(workers)
    n_streams = 8 # Performance optimization

    # --- Parameters ---

    # Data parameters
    train_size = 100000
    test_size = 1000
    # Total number of samples: training plus test rows.
    n_samples = train_size + test_size
    n_features = 20

    # Random Forest building parameters
    max_depth = 12
    n_bins = 16
    n_trees = 1000

Пример #11

Показать файл

class RapidsCloudML(object):
    """Load data, train and score a classifier on a local Dask cluster.

    The behaviour of every method is selected by substring checks on two
    string attributes:

    * ``compute_type`` - ``'multi-GPU'`` or ``'multi-CPU'`` start a local
      cluster; any other value leaves ``cluster`` and ``client`` as ``None``.
    * ``model_type`` - ``'XGBoost'`` or ``'RandomForest'``.
    """

    def __init__(self,
                 model_type='RandomForest',
                 compute_type='multi-GPU',
                 CSP_paths=default_sagemaker_paths):
        """Store the configuration and start the matching local cluster.

        Parameters
        ----------
        model_type : str
            Name of the model family to train.
        compute_type : str
            Selects the cluster flavour (see the class docstring).
        CSP_paths : mapping
            Path table; this class reads the ``'train_data'`` and
            ``'output'`` entries.
        """
        self.CSP_paths = CSP_paths
        self.model_type = model_type
        self.compute_type = compute_type

        # CPU or GPU cluster
        if 'multi-GPU' in self.compute_type:
            # One worker per visible CUDA device.
            self.n_workers = cupy.cuda.runtime.getDeviceCount()
            self.cluster = LocalCUDACluster(n_workers=self.n_workers)
            self.client = Client(self.cluster)
            print(f'dask multi-GPU cluster with {self.n_workers} workers ')

        elif 'multi-CPU' in self.compute_type:
            # One single-threaded worker per CPU reported by the OS.
            self.n_workers = os.cpu_count()
            self.cluster = LocalCluster(n_workers=self.n_workers,
                                        threads_per_worker=1)
            self.client = Client(self.cluster)
            print(f'dask multi-CPU cluster with {self.n_workers} workers')
        else:
            # No cluster is created; ``n_workers`` is left unset here.
            self.cluster = None
            self.client = None

    def load_data(self, filename='*.parquet', columns=None):
        """Read the training parquet files into a distributed dataframe.

        Parameters
        ----------
        filename : str
            File name or glob appended to ``CSP_paths['train_data']``.
        columns : list or None
            Forwarded to ``read_parquet`` as ``columns``.

        Returns
        -------
        The dataframe with missing rows dropped, repartitioned into
        ``4 * n_workers`` partitions.

        Raises
        ------
        ValueError
            If ``compute_type`` is neither multi-CPU nor multi-GPU
            (previously this surfaced as an ``UnboundLocalError``).
        """
        target_filename = self.CSP_paths['train_data'] + '/' + filename
        self.log(f'\n> loading dataset from {target_filename}...\n')

        with PerfTimer(self, 'ingestion_timer'):
            if 'multi-CPU' in self.compute_type:
                dataset = dask.dataframe.read_parquet(target_filename,
                                                      columns=columns)

            elif 'multi-GPU' in self.compute_type:
                dataset = dask_cudf.read_parquet(target_filename,
                                                 columns=columns)

            else:
                raise ValueError(
                    f'load_data does not support compute_type '
                    f'{self.compute_type!r}; expected multi-CPU or multi-GPU')

            dataset = dataset.dropna()
            dataset = dataset.repartition(npartitions=self.n_workers * 4)

        print(f'dataset len : {len(dataset)}')
        return dataset

    def split_data(self,
                   dataset,
                   y_label,
                   train_size=.8,
                   random_state=0,
                   shuffle=True):
        """Split ``dataset`` into train/test features and labels.

        Features are cast to ``float32`` and labels to ``int32``. On a
        multi-GPU cluster the four collections are persisted across all
        workers before returning.

        NOTE(review): ``train_size`` and ``shuffle`` are accepted but are not
        forwarded to ``train_test_split``; only ``random_state`` is. The
        split ratio is therefore whatever that function defaults to -
        confirm this is intended.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)``.
        """
        with PerfTimer(self, 'split_timer'):
            train, test = train_test_split(
                dataset, random_state=random_state
            )  # unable to shuffle -- no dask_cudf sampler implemented

            X_train, y_train = train.drop(
                y_label,
                axis=1).astype('float32'), train[y_label].astype('int32')
            X_test, y_test = test.drop(
                y_label,
                axis=1).astype('float32'), test[y_label].astype('int32')

        if 'multi-GPU' in self.compute_type:
            with PerfTimer(self, 'persist_timer'):
                workers = self.client.has_what().keys()
                X_train, X_test, y_train, y_test = persist_across_workers(
                    self.client, [X_train, X_test, y_train, y_test],
                    workers=workers)
                wait([X_train, X_test, y_train, y_test])

        return X_train, X_test, y_train, y_test

    def train_model(self, X_train, y_train, model_params):
        """Fit the configured model and return it.

        Parameters
        ----------
        X_train, y_train
            Training features and labels.
        model_params : dict
            For XGBoost: the training parameters plus ``'num_boost_round'``.
            For RandomForest: must hold ``'n_estimators'``, ``'max_depth'``
            and ``'max_features'``. The dict is not modified.

        Returns
        -------
        The XGBoost booster, the fitted random forest, or ``None`` when
        ``model_type`` matches neither family.
        """
        with PerfTimer(self, 'train_timer'):

            if 'XGBoost' in self.model_type:
                dtrain = xgboost.dask.DaskDMatrix(self.client, X_train,
                                                  y_train)

                # Work on a copy so the caller's dict keeps its
                # 'num_boost_round' entry and can be reused for another call.
                params = dict(model_params)

                # avoids warning messages
                boosting_rounds = params.pop('num_boost_round')

                trained_model = xgboost.dask.train(
                    self.client,
                    params,
                    dtrain,
                    num_boost_round=boosting_rounds)
                return trained_model['booster']

            elif 'RandomForest' in self.model_type:
                if 'GPU' in self.compute_type:
                    from cuml.dask.ensemble import RandomForestClassifier
                    rf_model = RandomForestClassifier(
                        n_estimators=model_params['n_estimators'],
                        max_depth=model_params['max_depth'],
                        max_features=model_params['max_features'],
                        n_bins=32)
                else:
                    from sklearn.ensemble import RandomForestClassifier
                    rf_model = RandomForestClassifier(
                        n_estimators=model_params['n_estimators'],
                        max_depth=model_params['max_depth'],
                        max_features=model_params['max_features'],
                        n_jobs=-1)

                trained_model = rf_model.fit(X_train, y_train)
                return trained_model
            # Only reached when model_type matched neither family above.
            print(len(X_train))
        return None

    def evaluate_test_perf(self, trained_model, X_test, y_test, threshold=0.5):
        """Score ``trained_model`` on the test split and return the accuracy.

        Parameters
        ----------
        trained_model
            Model returned by ``train_model``.
        X_test, y_test
            Test features and labels.
        threshold : float
            XGBoost only: predictions ``>= threshold`` become label 1,
            the rest label 0.

        Raises
        ------
        ValueError
            If ``model_type`` or ``compute_type`` is not one of the supported
            values (previously this surfaced as an ``UnboundLocalError``).
        """
        with PerfTimer(self, 'score_timer'):

            if 'XGBoost' in self.model_type:
                dtest = xgboost.dask.DaskDMatrix(self.client, X_test, y_test)
                predictions = xgboost.dask.predict(self.client, trained_model,
                                                   dtest).compute()
                predictions = np.where(
                    predictions >= threshold, 1,
                    0)  # threshold returned probabilities into 0/1 labels

            elif 'RandomForest' in self.model_type:
                predictions = trained_model.predict(X_test)
                # Everything except the multi-CPU path yields a lazy result
                # that has to be materialised.
                if 'multi-CPU' not in self.compute_type:
                    predictions = predictions.compute()

            else:
                raise ValueError(
                    f'evaluate_test_perf does not support model_type '
                    f'{self.model_type!r}')

            if 'multi' in self.compute_type:
                y_test = y_test.compute()

            if 'GPU' in self.compute_type:
                test_accuracy = cuml_accuracy_score(y_test, predictions)
            elif 'CPU' in self.compute_type:
                test_accuracy = sklearn_accuracy_score(y_test, predictions)
            else:
                raise ValueError(
                    f'evaluate_test_perf does not support compute_type '
                    f'{self.compute_type!r}')

        return test_accuracy

    # emit score so sagemaker can parse it (using string REGEX)
    def emit_score(self, test_accuracy):
        """Log the test accuracy in a fixed ``test-accuracy: <value>;`` form."""
        self.log(f'\n\t test-accuracy: {test_accuracy}; \n')

    def save_best_model(self, global_best_model=None):
        """Placeholder; currently does nothing."""
        pass

    def set_up_logging(self):
        """Send ``logging`` output to ``log.txt`` under ``CSP_paths['output']``."""
        logging_path = self.CSP_paths['output'] + '/log.txt'
        logging.basicConfig(filename=logging_path, level=logging.INFO)

    def log(self, text):
        """Write ``text`` to the log at INFO level and echo it to stdout."""
        logging.info(text)
        print(text)

Пример #12

Показать файл

Файл: test_nearest_neighbors.py Проект: raydouglass/dask-cuml

def test_end_to_end():
    """Compare dask-cuml nearest neighbours against scikit-learn.

    Starts a local CUDA cluster, builds one random cuDF partition per worker,
    fits both implementations on the same data and asserts that the returned
    indices match exactly and the distances match within ``1e-5``.

    The client and the cluster are always shut down, including when an
    assertion or any earlier step fails.
    """
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = None

    try:
        client = Client(cluster)

        # NOTE: The LocalCUDACluster needs to be started before any imports
        # that could potentially create a CUDA context.

        import dask_cudf

        import cudf
        import numpy as np

        from dask_cuml.neighbors import NearestNeighbors as cumlKNN

        def create_df(f, m, n):
            """Build an ``m`` x ``n`` random float32 frame for partition ``f``.

            The index covers ``[f * m, f * m + m)`` so that partitions built
            with different ``f`` do not share index values.
            """
            X = np.random.rand(m, n)
            ret = cudf.DataFrame(
                [(i, X[:, i].astype(np.float32)) for i in range(n)],
                index=cudf.dataframe.RangeIndex(f * m, f * m + m, 1))
            return ret

        def get_meta(df):
            """Return an empty slice of ``df`` to serve as Dask metadata."""
            ret = df.iloc[:0]
            return ret

        # Per gpu/worker
        train_m = 500
        train_n = 25

        search_m = 10
        search_k = 15

        workers = client.has_what().keys()

        # Create dfs on each worker (gpu); the position of the worker is used
        # as the partition number.
        dfs = [
            client.submit(create_df, n, train_m, train_n, workers=[worker])
            for n, worker in enumerate(workers)
        ]

        # Wait for completion
        wait(dfs)

        meta = client.submit(get_meta, dfs[0]).result()

        X_df = dask_cudf.from_delayed(dfs, meta=meta)
        X_pd = X_df.compute().to_pandas()

        cumlNN = cumlKNN()
        cumlNN.fit(X_df)

        sklNN = NearestNeighbors(metric="sqeuclidean")
        sklNN.fit(X_pd)

        # NOTE(review): the distributed slice ends at ``search_m - 1`` while
        # the pandas slice ends at ``search_m``; presumably one is
        # label-inclusive and the other positional - confirm that both
        # select the same rows.
        cuml_D, cuml_I = cumlNN.kneighbors(X_df[0:search_m - 1], search_k)
        sk_D, sk_I = sklNN.kneighbors(X_pd[0:search_m], search_k)

        cuml_I_nd = np.array(cuml_I.compute().as_gpu_matrix(),
                             dtype=sk_I.dtype)
        cuml_D_nd = np.array(cuml_D.compute().as_gpu_matrix(),
                             dtype=sk_D.dtype)

        print(cuml_D_nd.dtype)
        print(sk_D.dtype)

        assert np.array_equal(cuml_I_nd, sk_I)
        assert np.allclose(cuml_D_nd, sk_D, atol=1e-5)

    finally:
        # Release the client first, then the cluster, even if closing the
        # client fails.
        try:
            if client is not None:
                client.close()
        finally:
            cluster.close()

Пример #13

Показать файл

def test_end_to_end(nrows, ncols, nclusters, n_parts,
                    delayed_predict, input_type, cluster):
    """Fit distributed cuML KMeans on blobs and check the predicted labels.

    Parameters
    ----------
    nrows, ncols, nclusters
        Size of the generated data set and number of blob centres.
    n_parts
        Number of partitions passed to ``make_blobs``; may be ``None``.
    delayed_predict
        Forwarded as the second argument of ``predict``.
    input_type : str
        ``"dataframe"`` converts the blobs with ``to_dask_cudf``;
        ``"array"`` uses them as returned.
    cluster
        Cluster the Dask client connects to.

    The client is closed on exit if it was created.
    """
    client = None

    try:

        client = Client(cluster)
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          random_state=10)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
        elif input_type == "array":
            X_train, y_train = X, y

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)
        cumlLabels = cumlModel.predict(X_train, delayed_predict)

        # has_what() is keyed by worker, so its length is the worker count.
        n_workers = len(client.has_what())

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            parts_len = n_parts
        else:
            parts_len = n_workers

        if input_type == "dataframe":
            assert cumlLabels.npartitions == parts_len
            cumlPred = cp.array(cumlLabels.compute().to_pandas().values)
            labels = cp.squeeze(y_train.compute().to_pandas().values)
        elif input_type == "array":
            assert len(cumlLabels.chunks[0]) == parts_len
            cumlPred = cp.array(cumlLabels.compute())
            labels = cp.squeeze(y_train.compute())

        assert cumlPred.shape[0] == nrows
        assert cp.max(cumlPred) == nclusters - 1
        assert cp.min(cumlPred) == 0

        score = adjusted_rand_score(labels, cumlPred)

        print(score)

        assert score == 1.0

    finally:
        # Client(cluster) may have failed, leaving client as None; closing
        # unconditionally would raise AttributeError and hide the real error.
        if client is not None:
            client.close()