Example #1
    def split_data(self,
                   dataset,
                   y_label,
                   train_size=.8,
                   random_state=0,
                   shuffle=True):

        with PerfTimer(self, 'split_timer'):
            train, test = train_test_split(
                dataset, random_state=random_state
            )  # unable to shuffle -- no dask_cudf sampler implemented

            X_train, y_train = train.drop(
                y_label,
                axis=1).astype('float32'), train[y_label].astype('int32')
            X_test, y_test = test.drop(
                y_label,
                axis=1).astype('float32'), test[y_label].astype('int32')

        if 'multi-GPU' in self.compute_type:
            with PerfTimer(self, 'persist_timer'):
                workers = self.client.has_what().keys()
                X_train, X_test, y_train, y_test = persist_across_workers(
                    self.client, [X_train, X_test, y_train, y_test],
                    workers=workers)
                wait([X_train, X_test, y_train, y_test])

        return X_train, X_test, y_train, y_test
Example #2
def create_cuml_distributed(X_train, y_train):
    start_time = datetime.now()
    print('init dask cluster')

    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)
    workers = client.has_what().keys()

    n_workers = len(workers)
    X_train_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    y_train_cudf = cudf.Series(y_train)

    X_train_dask = dask_cudf.from_cudf(X_train_cudf, npartitions=n_workers)
    y_train_dask = dask_cudf.from_cudf(y_train_cudf, npartitions=n_workers)

    X_train_ddask, y_train_ddask = dask_utils.persist_across_workers(
        client, [X_train_dask, y_train_dask], workers=workers)
    print('cuml distributed initialized', datetime.now() - start_time)
    model = distributed_cuml_Rf(n_estimators=500, n_streams=64)
    model.fit(X_train_ddask, y_train_ddask)  # fit on the persisted dask collections, not the host arrays

    wait(model.rfs)
    print('cuml distributed finished', datetime.now() - start_time)
    client.close()
    cluster.close()
    return model
Example #3
    def split_dataset(self, dataset, random_state):
        """
        Split dataset into train and test data subsets,
        currently using CV-fold index for randomness.
        Plan to refactor with dask_ml KFold
        """
        hpo_log.info('> train-test split')
        label_column = self.hpo_config.label_column

        train, test = train_test_split(dataset, random_state=random_state)

        # build X [ features ], y [ labels ] for the train and test subsets
        y_train = train[label_column]
        X_train = train.drop(label_column, axis=1)
        y_test = test[label_column]
        X_test = test.drop(label_column, axis=1)

        # force execution
        X_train, y_train, X_test, y_test = persist_across_workers(
            self.client, [X_train, y_train, X_test, y_test],
            workers=self.client.has_what().keys())

        # wait!
        wait([X_train, y_train, X_test, y_test])

        return (X_train.astype(self.hpo_config.dataset_dtype),
                X_test.astype(self.hpo_config.dataset_dtype),
                y_train.astype(self.hpo_config.dataset_dtype),
                y_test.astype(self.hpo_config.dataset_dtype))
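
The docstring above mentions a planned refactor to dask_ml's KFold. A rough sketch of that direction, purely illustrative: it assumes the features have already been gathered into a dask array (X_array below), since dask_ml's KFold splits arrays rather than dask_cudf DataFrames.

from dask_ml.model_selection import KFold

# Illustrative only: cross-validated folds instead of a single train/test split.
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
for train_idx, test_idx in kf.split(X_array):
    # train_idx / test_idx are lazy index arrays selecting each fold's rows
    ...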
Example #4
def test_rf_regression_dask_fil(partitions_per_worker, cluster):

    # Use CUDA_VISIBLE_DEVICES to control the number of workers
    c = Client(cluster)

    try:

        X, y = make_regression(n_samples=10000,
                               n_features=20,
                               n_informative=10,
                               random_state=123)

        X = X.astype(np.float32)
        y = y.astype(np.float32)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=1000)

        cu_rf_params = {
            'n_estimators': 50,
            'max_depth': 16,
            'n_bins': 16,
        }

        workers = c.has_what().keys()
        n_partitions = partitions_per_worker * len(workers)

        X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
        X_train_df = \
            dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

        y_cudf = np.array(pd.DataFrame(y_train).values)
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)
        y_train_df = \
            dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)
        X_cudf_test = cudf.DataFrame.from_pandas(pd.DataFrame(X_test))
        X_test_df = \
            dask_cudf.from_cudf(X_cudf_test, npartitions=n_partitions)

        X_train_df, y_train_df = dask_utils.persist_across_workers(
            c, [X_train_df, y_train_df], workers=workers)

        cu_rf_mg = cuRFR_mg(**cu_rf_params)
        cu_rf_mg.fit(X_train_df, y_train_df)

        cu_rf_mg_predict = cu_rf_mg.predict(X_test_df).compute()
        cu_rf_mg_predict = cp.asnumpy(cp.array(cu_rf_mg_predict))

        acc_score = r2_score(cu_rf_mg_predict, y_test)

        assert acc_score >= 0.67

    finally:
        c.close()
Example #5
def _prep_training_data(c, X_train, partitions_per_worker):
    workers = c.has_what().keys()
    n_partitions = partitions_per_worker * len(workers)

    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))

    X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)
    X_train_df, = dask_utils.persist_across_workers(c, [X_train_df],
                                                    workers=workers)

    return X_train_df
Example #6
    def ETL(self, columns=None, label_column=None, random_seed=0):
        """ Perform ETL on the target dataset to prepare it for model training.
            1. Ingest the parquet-compressed dataset
            2. Drop samples with missing data [ predominantly cancelled flights ]
            3. Split the dataset into train and test subsets
        """
        with PerfTimer('ETL'):
            if 'single' in self.compute_type:
                if 'CPU' in self.compute_type:
                    from sklearn.model_selection import train_test_split
                    dataset = pandas.read_parquet(self.target_files,
                                                  columns=columns,
                                                  engine='pyarrow')
                    dataset = self.handle_missing_data(dataset)
                    X_train, X_test, y_train, y_test = train_test_split(
                        dataset.loc[:, dataset.columns != label_column],
                        dataset[label_column],
                        random_state=random_seed)
                elif 'GPU' in self.compute_type:
                    from cuml.preprocessing.model_selection import train_test_split
                    dataset = cudf.read_parquet(self.target_files,
                                                columns=columns)
                    dataset = self.handle_missing_data(dataset)
                    X_train, X_test, y_train, y_test = train_test_split(
                        dataset, label_column, random_state=random_seed)

            elif 'multi' in self.compute_type:
                from dask_ml.model_selection import train_test_split
                if 'CPU' in self.compute_type:
                    dataset = self.dask_cpu_parquet_ingest(self.target_files,
                                                           columns=columns)
                elif 'GPU' in self.compute_type:
                    dataset = self.dask_gpu_parquet_ingest(self.target_files,
                                                           columns=columns)

                dataset = self.handle_missing_data(dataset)

                # split [ always runs, regardless of whether dataset is cached ]
                train, test = train_test_split(dataset,
                                               random_state=random_seed)

                # build X [ features ], y [ labels ] for the train and test subsets
                y_train = train[label_column]
                X_train = train.drop(label_column, axis=1)
                y_test = test[label_column]
                X_test = test.drop(label_column, axis=1)

                if 'GPU' in self.compute_type:
                    X_train, y_train, X_test, y_test = persist_across_workers(
                        self.client, [X_train, y_train, X_test, y_test],
                        workers=self.client.has_what().keys())

            return X_train.astype('float32'), X_test.astype('float32'),\
                   y_train.astype('int32'), y_test.astype('int32')
Example #7
def _prep_training_data(c, X_train, y_train, partitions_per_worker):
    workers = c.has_what().keys()
    n_partitions = partitions_per_worker * len(workers)
    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

    y_cudf = np.array(pd.DataFrame(y_train).values)
    y_cudf = y_cudf[:, 0]
    y_cudf = cudf.Series(y_cudf)
    y_train_df = dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

    X_train_df, y_train_df = dask_utils.persist_across_workers(
        c, [X_train_df, y_train_df], workers=workers)
    return X_train_df, y_train_df
Example #8
def distribute(X, y):
    # First convert to cudf (with real data, you would likely load in cuDF format to start)
    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X))
    y_cudf = cudf.Series(y)

    # Partition with Dask
    # In this case, each worker will train on 1/n_partitions fraction of the data
    X_dask = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)
    y_dask = dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

    # Persist to cache the data in active memory
    X_dask, y_dask = \
      dask_utils.persist_across_workers(c, [X_dask, y_dask], workers=workers)
    
    return X_dask, y_dask
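
distribute relies on c, workers, and n_partitions being defined in the enclosing scope (as in the _prep_training_data helpers elsewhere in these examples). A minimal setup sketch for those names; the cluster parameters are illustrative assumptions, not part of the original example.

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# Hypothetical definitions for the free variables used by distribute() above.
cluster = LocalCUDACluster(threads_per_worker=1)  # one worker per visible GPU
c = Client(cluster)

workers = list(c.has_what().keys())               # worker addresses known to the scheduler
n_partitions = 2 * len(workers)                   # e.g. two partitions per worker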
Example #9
    def fit_random_forest(self, X_train, y_train, model_params):
        """
        Trains a RandomForest model on X_train and y_train with model_params.
        Depending on compute_type, estimators from appropriate packages are used.
        CPU - sklearn
        Single-GPU - cuml
        Multi-GPU - cuml.dask

        Parameters and objects returned are the same as trained_model.
        """
        if "CPU" in self.compute_type:
            rf_model = sklearn.ensemble.RandomForestClassifier(
                n_estimators=model_params["n_estimators"],
                max_depth=model_params["max_depth"],
                max_features=model_params["max_features"],
                n_jobs=int(self.n_workers),
                verbose=self.verbose_estimator,
            )
        elif "GPU" in self.compute_type:
            if "single" in self.compute_type:
                rf_model = cuml.ensemble.RandomForestClassifier(
                    n_estimators=model_params["n_estimators"],
                    max_depth=model_params["max_depth"],
                    n_bins=model_params["n_bins"],
                    max_features=model_params["max_features"],
                    verbose=self.verbose_estimator,
                )
            elif "multi" in self.compute_type:
                self.log_to_file("\n\tFitting multi-GPU daskRF")
                X_train, y_train = dask_utils.persist_across_workers(
                    self.client,
                    [X_train.fillna(0.0),
                     y_train.fillna(0.0)],
                    workers=self.workers,
                )
                rf_model = cuml.dask.ensemble.RandomForestClassifier(
                    n_estimators=model_params["n_estimators"],
                    max_depth=model_params["max_depth"],
                    n_bins=model_params["n_bins"],
                    max_features=model_params["max_features"],
                    verbose=self.verbose_estimator,
                )
        with PerfTimer() as train_timer:
            try:
                trained_model = rf_model.fit(X_train, y_train)
            except Exception as error:
                self.log_to_file("\n\n! Error during fit " + str(error))
                trained_model = None  # avoid UnboundLocalError on the return below
        return trained_model, train_timer.duration
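
For reference, a hypothetical call matching the hyper-parameter keys consumed above (the experiment instance name and the specific values are assumptions, not taken from the original snippet).

# Keys mirror those read by fit_random_forest above.
model_params = {
    'n_estimators': 100,
    'max_depth': 16,
    'max_features': 1.0,
    'n_bins': 64,
}
trained_model, fit_seconds = experiment.fit_random_forest(X_train, y_train, model_params)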
Example #10
def test_rf_regression(n_workers, partitions_per_worker):
    if dask_cuda.utils.get_n_gpus() < n_workers:
        pytest.skip("too few GPUs")

    cluster = LocalCUDACluster(threads_per_worker=1, n_workers=n_workers)
    c = Client(cluster)

    X, y = make_regression(n_samples=40000,
                           n_features=20,
                           n_informative=10,
                           random_state=123)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000)

    cu_rf_params = {
        'n_estimators': 25,
        'max_depth': 13,
    }

    workers = c.has_what().keys()
    n_partitions = partitions_per_worker * len(workers)

    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    X_train_df = \
        dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

    y_cudf = np.array(pd.DataFrame(y_train).values)
    y_cudf = y_cudf[:, 0]
    y_cudf = cudf.Series(y_cudf)
    y_train_df = \
        dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

    X_train_df, y_train_df = dask_utils.persist_across_workers(
        c, [X_train_df, y_train_df], workers=workers)

    cu_rf_mg = cuRFR_mg(**cu_rf_params)
    cu_rf_mg.fit(X_train_df, y_train_df)
    cu_rf_mg_predict = cu_rf_mg.predict(X_test)

    acc_score = r2_score(cu_rf_mg_predict, y_test)

    print(str(acc_score))

    assert acc_score >= 0.70

    c.close()
    cluster.close()
Example #11
    def persist_training_inputs(self, X_train, y_train, X_test, y_test):
        """ In the case of dask multi-CPU and dask multi-GPU Random Forest, 
            we need the dataset to be computed/persisted prior to a fit call.
            In the case of XGBoost this step is performed by the DMatrix creation.
        """
        if 'multi-CPU' in self.compute_type:
            X_train = X_train.persist()
            y_train = y_train.persist()

        elif 'multi-GPU' in self.compute_type:
            from cuml.dask.common.utils import persist_across_workers
            X_train, y_train, X_test, y_test = persist_across_workers(
                self.client, [X_train, y_train, X_test, y_test],
                workers=self.client.has_what().keys())
            wait([X_train, y_train, X_test, y_test])

        return X_train, y_train, X_test, y_test
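
For contrast with the XGBoost note in the docstring: with xgboost's dask API the distributed DMatrix construction plays this materialization role, so no explicit persist call is needed first. A minimal sketch with illustrative parameters, not taken from the original snippet; client, X_train, and y_train are assumed to be the dask objects used above.

import xgboost as xgb

# Building the DaskDMatrix from the lazy dask collections is the step the
# docstring refers to; xgb.dask.train returns a dict holding the trained
# 'booster' and the evaluation 'history'.
dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
output = xgb.dask.train(client,
                        {'tree_method': 'gpu_hist'},
                        dtrain,
                        num_boost_round=100)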
Example #12
def _prep_training_data(c,
                        X_train,
                        partitions_per_worker,
                        reverse_order=False):
    workers = c.has_what().keys()

    if reverse_order:
        workers = list(workers)[::-1]

    n_partitions = partitions_per_worker * len(workers)

    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))

    X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)
    X_train_df, = dask_utils.persist_across_workers(c, [X_train_df],
                                                    workers=list(workers))

    return X_train_df
Example #13
def test_rf_regression_dask_cpu(partitions_per_worker, client):
    n_workers = len(client.scheduler_info()['workers'])

    X, y = make_regression(n_samples=n_workers * 2000,
                           n_features=20,
                           n_informative=10,
                           random_state=123)

    X = X.astype(np.float32)
    y = y.astype(np.float32)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=n_workers * 400,
                         random_state=123)

    cu_rf_params = {
        'n_estimators': 50,
        'max_depth': 16,
        'n_bins': 16,
    }

    workers = client.has_what().keys()
    n_partitions = partitions_per_worker * len(workers)

    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    X_train_df = \
        dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

    y_cudf = cudf.Series(y_train)
    y_train_df = \
        dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

    X_train_df, y_train_df = dask_utils.persist_across_workers(
        client, [X_train_df, y_train_df], workers=workers)

    cuml_mod = cuRFR_mg(**cu_rf_params)
    cuml_mod.fit(X_train_df, y_train_df)

    cuml_mod_predict = cuml_mod.predict(X_test, predict_model='CPU')

    acc_score = r2_score(cuml_mod_predict, y_test)

    assert acc_score >= 0.67
Example #14
    def ETL ( self, cached_dataset = None, random_seed = 0, 
              columns = airline_columns, label_column = airline_label_column ):
        """
            run ETL [  ingest -> rebalance -> drop missing -> split -> persist ]
            after the first run the dataset is cached, so only split is re-run (re-shuffled)
        """        
        with PerfTimer( 'ETL' ):
            if 'single' in self.compute_type:

                if 'CPU' in self.compute_type:
                    from sklearn.model_selection import train_test_split
                    dataset = pandas.read_parquet( self.target_files, columns = columns, engine='pyarrow' ) 
                    dataset = dataset.dropna()
                    X_train, X_test, y_train, y_test = train_test_split( dataset.loc[:, dataset.columns != label_column], 
                                                                         dataset[label_column], random_state = random_seed )

                elif 'GPU' in self.compute_type:
                    from cuml.preprocessing.model_selection import train_test_split
                    dataset = cudf.read_parquet( self.target_files, columns = columns )
                    dataset = dataset.dropna()                
                    X_train, X_test, y_train, y_test = train_test_split( dataset, label_column, random_state = random_seed )

                return X_train, X_test, y_train, y_test, dataset

            elif 'multi' in self.compute_type:
                from dask_ml.model_selection import train_test_split

                if cached_dataset is None:

                    if 'CPU' in self.compute_type:
                        dataset = dask.dataframe.read_parquet( self.target_files, columns = columns, engine='pyarrow') 
                    elif 'GPU' in self.compute_type:
                        dataset = dask_cudf.read_parquet( self.target_files, columns = columns )
                    
                    # drop missing values [ ~2.5% -- predominantly cancelled flights ]
                    dataset = dataset.dropna()

                    # repartition [ inplace ], rebalance ratio of workers & data partitions
                    initial_npartitions = dataset.npartitions    
                    dataset = dataset.repartition( npartitions = self.n_workers )
                    
                else:
                    print( f"using cache [ skiping ingestion, dropna, and repartition ]")                
                    if 'multi-CPU' in self.compute_type:
                        assert( type(cached_dataset) == dask.dataframe.core.DataFrame )
                    if 'multi-GPU' in self.compute_type:
                        assert( type(cached_dataset) == dask_cudf.core.DataFrame )

                    dataset = cached_dataset

                # split [ always runs, regardless of whether dataset is cached ]
                train, test = train_test_split( dataset, random_state = random_seed ) 

                # build X [ features ], y [ labels ] for the train and test subsets
                y_train = train[label_column].astype('int32')
                X_train = train.drop(label_column, axis = 1).astype('float32')

                y_test = test[label_column].astype('int32')
                X_test = test.drop(label_column, axis = 1).astype('float32')

                # force computation / persist
                if 'multi-CPU' in self.compute_type:
                    X_train = X_train.persist(); X_test = X_test.persist()
                    y_train = y_train.persist(); y_test = y_test.persist()

                elif 'multi-GPU' in self.compute_type:
                    workers = self.client.has_what().keys()
                    X_train, X_test, y_train, y_test = \
                        persist_across_workers( self.client,  [ X_train, X_test, y_train, y_test ],
                                                workers = workers )

                    wait( [X_train, X_test, y_train, y_test] );

                # return [ CPU/GPU ] dask dataframes 
                return X_train, X_test, y_train, y_test, dataset  
        
        return None
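
As the docstring notes, the dataset handle returned by ETL can be fed back in so that later runs skip ingestion, dropna, and repartitioning. A hypothetical usage sketch; the experiment instance name is an assumption, not from the original snippet.

# First call ingests and repartitions; the returned dataset acts as the cache.
X_train, X_test, y_train, y_test, dataset = experiment.ETL(random_seed=0)

# Subsequent calls reuse the cached dataset, so only the train/test split re-runs.
X_train, X_test, y_train, y_test, dataset = experiment.ETL(cached_dataset=dataset,
                                                           random_seed=1)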