示例#1
0
def test_stratified_binary_classification():
    """Stratified split rejects singleton classes and keeps label totals."""
    features = cp.array([[0.37487513, -2.3031888, 1.662633, 0.7671007],
                         [-0.49796826, -1.0621182, -0.32518214, -0.20583323],
                         [-1.0104885, -2.4997945, 2.8952584, 1.4712684],
                         [2.008748, -2.4520662, 0.5557737, 0.07749569],
                         [0.97350526, -0.3403474, -0.58081895, -0.23199573]])

    # The split must fail when a label occurs only once.
    singleton_labels = cp.array([0, 0, 0, 0, 1])
    with pytest.raises(ValueError):
        train_test_split(features,
                         singleton_labels,
                         train_size=0.75,
                         stratify=singleton_labels,
                         shuffle=True)

    labels = cp.array([0, 0, 0, 1, 1])
    _, _, labels_train, labels_test = train_test_split(features,
                                                       labels,
                                                       train_size=0.75,
                                                       stratify=labels,
                                                       random_state=15)

    def occurrences(values):
        # Per-class counts, in the order cp.unique reports the classes.
        _, per_class = cp.unique(values, return_counts=True)
        return per_class

    expected = occurrences(labels)
    in_train = occurrences(labels_train)
    in_test = occurrences(labels_test)

    # Ensure the split preserved the number of samples of every label.
    cp.testing.assert_array_equal(in_train + in_test, expected)
示例#2
0
def train_test_split(X,
                     y=None,
                     shuffle=True,
                     random_state=None,
                     stratify=None,
                     **kwargs):
    """Split ``X`` (and ``y``) with a fallback for stratified failures.

    Labels whose dtype prints as ``'object'`` are routed to
    ``_train_test_split_with_object``. Everything else goes to
    ``cm_sel.train_test_split``. If that call raises an exception whose
    message contains ``cudaErrorInvalidValue`` while ``stratify`` was
    requested, a warning is logged and the split is retried once without
    stratification. Any other exception propagates unchanged.
    """
    common = dict(shuffle=shuffle, random_state=random_state)

    if y is not None and str(y.dtype) == 'object':
        return _train_test_split_with_object(X,
                                             y,
                                             stratify=stratify,
                                             **common,
                                             **kwargs)
    try:
        return cm_sel.train_test_split(X,
                                       y,
                                       stratify=stratify,
                                       **common,
                                       **kwargs)
    except Exception as e:
        retryable = (stratify is not None
                     and 'cudaErrorInvalidValue' in str(e))
        if not retryable:
            raise
        logger.warning('train_test_split failed, retry without stratify')
        return cm_sel.train_test_split(X, y, **common, **kwargs)
示例#3
0
def test_random_state(seed_type):
    """Two splits seeded equivalently must yield identical partitions."""
    for _ in range(10):
        seed_n = np.random.randint(0, int(1e9))
        if seed_type == 'int':
            seed = seed_n
        elif seed_type == 'cupy':
            seed = cp.random.RandomState(seed=seed_n)
        elif seed_type == 'numpy':
            seed = np.random.RandomState(seed=seed_n)

        X = cudf.DataFrame({"x": range(100)})
        y = cudf.Series(([0] * (100 // 2)) + ([1] * (100 // 2)))

        X_tr_a, X_te_a, y_tr_a, y_te_a = train_test_split(X,
                                                          y,
                                                          random_state=seed)

        # RandomState objects are rebuilt from the same integer before the
        # second split; a plain int seed is reused as is.
        if seed_type == 'cupy':
            seed = cp.random.RandomState(seed=seed_n)
        elif seed_type == 'numpy':
            seed = np.random.RandomState(seed=seed_n)

        X_tr_b, X_te_b, y_tr_b, y_te_b = train_test_split(X,
                                                          y,
                                                          random_state=seed)

        pairs = ((X_tr_a, X_tr_b), (X_te_a, X_te_b),
                 (y_tr_a, y_tr_b), (y_te_a, y_te_b))
        for first, second in pairs:
            assert first.equals(second)
示例#4
0
def test_split_dataframe_array(y_type):
    """Output containers must mirror the input containers of X and y."""
    X = cudf.DataFrame({"x": range(100)})
    y = cudf.Series(([0] * (100 // 2)) + ([1] * (100 // 2)))

    if y_type == "cupy":
        labels, label_cls = y.values, cp.ndarray
    elif y_type == "cudf":
        labels, label_cls = y, cudf.Series
    else:
        # Any other y_type is not exercised by this test.
        return

    X_train, X_test, y_train, y_test = train_test_split(X, labels)
    for features in (X_train, X_test):
        assert isinstance(features, cudf.DataFrame)
    for targets in (y_train, y_test):
        assert isinstance(targets, label_cls)
示例#5
0
def make_classification_dataset(datatype, nrows, ncols, nclasses):
    """Generate a classification dataset and split off ``nrows`` for training.

    ``nrows + 1000`` samples are generated and handed to
    ``dsel.train_test_split`` with ``train_size=nrows``. If generation or
    splitting raises ``ValueError``, or the training labels do not contain
    all ``nclasses`` classes, the calling test is skipped via
    ``pytest.skip``.

    Returns the value of ``dsel.train_test_split``.
    """
    informative = min(ncols, int(max(nclasses * 2, math.ceil(ncols / 10))))
    clusters = min(2, max(1, int(2**informative / nclasses)))
    redundant = min(ncols - informative, max(2, math.ceil(ncols / 20)))
    try:
        X, y = data.make_classification(dtype=datatype,
                                        n_samples=nrows + 1000,
                                        n_features=ncols,
                                        random_state=SEED,
                                        class_sep=1.0,
                                        n_informative=informative,
                                        n_clusters_per_class=clusters,
                                        n_redundant=redundant,
                                        n_classes=nclasses)

        split = dsel.train_test_split(X,
                                      y,
                                      random_state=SEED,
                                      train_size=nrows)

        # split[2] holds the training labels.
        if len(cp.unique(split[2])) < nclasses:
            raise ValueError("Training data does not have all classes.")
    except ValueError:
        pytest.skip(
            "Skipping the test for invalid combination of ncols/nclasses")
    else:
        return split
def train_and_eval(X_param,
                   y_param,
                   penalty='l2',
                   C=1.0,
                   l1_ratio=None,
                   fit_intercept=True):
    """
        Fit a LogisticRegression on a train split and score it on the
        validation split.

        Parameters
        ----------
        X_param : DataFrame
            The data to split into training and validation parts.
        y_param : Series
            The labels matching ``X_param``.
        penalty, C, l1_ratio, fit_intercept
            Forwarded unchanged to ``LogisticRegression``.

        Returns
        -------
        score
            ``log_loss`` of the validation labels against the output of
            ``classifier.predict`` on the validation features.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(X_param,
                                                          y_param,
                                                          random_state=42)
    model_params = dict(penalty=penalty,
                        C=C,
                        l1_ratio=l1_ratio,
                        fit_intercept=fit_intercept)
    classifier = LogisticRegression(**model_params)
    classifier.fit(X_train, y_train)
    return log_loss(y_valid, classifier.predict(X_valid))
示例#7
0
def test_split_array_single_argument(type, test_size, train_size, shuffle):
    """Split a single array and check container type, sizes and ordering.

    ``type`` selects the input container ('cupy' or 'numba'; any other
    value leaves ``X`` as a NumPy array). When shuffling is disabled the
    train part must be the leading rows of ``X`` and the test part the
    trailing rows.
    """
    X = np.zeros((100, 10)) + np.arange(100).reshape(100, 1)
    if type == 'cupy':
        X = cp.asarray(X)

    if type == 'numba':
        X = cuda.to_device(X)
    X_train, X_test = train_test_split(X,
                                       train_size=train_size,
                                       test_size=test_size,
                                       shuffle=shuffle,
                                       random_state=0)

    if type == 'cupy':
        assert isinstance(X_train, cp.ndarray)
        assert isinstance(X_test, cp.ndarray)

    if type in ['numba', 'rmm']:
        assert cuda.devicearray.is_cuda_ndarray(X_train)
        assert cuda.devicearray.is_cuda_ndarray(X_test)

    if train_size is not None:
        assert X_train.shape[0] == int(X.shape[0] * train_size)

    if test_size is not None:
        assert X_test.shape[0] == int(X.shape[0] * test_size)

    # The original guard was `shuffle is None`. `not shuffle` also covers
    # shuffle=False so the ordered-split checks actually run.
    if not shuffle:
        # Normalise every container to CuPy so comparisons are elementwise
        # and independent of the input type.
        full = cp.asarray(X)
        head = cp.asarray(X_train)
        tail = cp.asarray(X_test)

        # Slice by the actual part lengths: train_size/test_size may be
        # fractions or None and cannot be used as slice bounds directly.
        cp.testing.assert_array_equal(head, full[:head.shape[0]])
        cp.testing.assert_array_equal(
            tail, full[full.shape[0] - tail.shape[0]:])

        # The parts only reassemble into X when they cover every row.
        if head.shape[0] + tail.shape[0] == full.shape[0]:
            X_rec = cp.sort(cp.concatenate([head, tail]), axis=0)
            cp.testing.assert_array_equal(X_rec, full)
示例#8
0
def test_pipeline():
    """A StandardScaler + SVC pipeline must score above 0.8 on held-out data."""
    features, labels = make_classification(random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        random_state=0)
    stages = [('scaler', StandardScaler()), ('svc', SVC())]
    pipe = Pipeline(steps=stages)
    pipe.fit(X_train, y_train)
    assert pipe.score(X_test, y_test) > 0.8
示例#9
0
def test_split_column():
    """Split a frame by label-column name and verify sizes and contents.

    The label column "y" is passed to ``train_test_split`` by name. After
    the split, the train and test parts are stitched back together and
    compared value-by-value with the original frame.
    """
    data = cudf.DataFrame(
        {
            "x": range(100),
            "y": ([0] * (100 // 2)) + ([1] * (100 // 2)),
        }
    )
    train_size = 0.8

    X_train, X_test, y_train, y_test = train_test_split(
        data, "y", train_size=train_size
    )

    assert (
        len(X_train) == len(y_train) == pytest.approx(train_size * len(data))
    )
    assert (
        len(X_test)
        == len(y_test)
        == pytest.approx((1 - train_size) * len(data))
    )

    # Pair features with labels by position (rows of X_* and y_* come from
    # the same split), so the check does not depend on index alignment.
    X_all = cudf.concat([X_train, X_test]).reset_index(drop=True)
    y_all = cudf.concat([y_train, y_test]).reset_index(drop=True)
    reconstructed = (
        X_all.assign(y=y_all).sort_values(by=["x"]).reset_index(drop=True)
    )
    assert len(reconstructed) == len(data)

    # `all(frame == other)` would iterate column labels and always pass;
    # compare the underlying values column by column instead.
    expected = data.to_pandas()
    actual = reconstructed.to_pandas()
    assert (actual["x"].values == expected["x"].values).all()
    assert (actual["y"].values == expected["y"].values).all()
示例#10
0
def test_sklearn_search():
    """Test ensures scoring function works with sklearn machinery

    A cuML Ridge estimator is driven by scikit-learn's ``GridSearchCV``
    over ten ``alpha`` candidates on the diabetes dataset; the search is
    expected to select ``alpha == 0.1``.
    """
    import numpy as np
    from cuml import Ridge as cumlRidge
    import cudf
    from sklearn import datasets
    from sklearn.model_selection import train_test_split, GridSearchCV
    diabetes = datasets.load_diabetes()
    X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
                                                        diabetes.target,
                                                        test_size=0.2,
                                                        shuffle=False,
                                                        random_state=1)

    alpha = np.array([1.0])
    fit_intercept = True
    normalize = False

    params = {'alpha': np.logspace(-3, -1, 10)}
    cu_clf = cumlRidge(alpha=alpha,
                       fit_intercept=fit_intercept,
                       normalize=normalize,
                       solver="eig")

    assert getattr(cu_clf, 'score', False)
    # `iid` is no longer accepted by GridSearchCV; the default already
    # averages the per-fold scores without weighting, which is what
    # iid=False requested.
    sk_cu_grid = GridSearchCV(cu_clf, params, cv=5)

    gdf_data = cudf.DataFrame(X_train)
    gdf_train = cudf.DataFrame(dict(train=y_train))

    sk_cu_grid.fit(gdf_data, gdf_train.train)
    assert sk_cu_grid.best_params_ == {'alpha': 0.1}
示例#11
0
def make_regression_dataset(datatype, nrows, ncols):
    """Generate a regression dataset and split off ``nrows`` for training.

    ``nrows + 1000`` samples are generated; the result of
    ``dsel.train_test_split`` with ``train_size=nrows`` is returned.
    """
    informative = max(min(ncols, 5), int(math.ceil(ncols / 5)))
    features, targets = data.make_regression(dtype=datatype,
                                             n_samples=nrows + 1000,
                                             n_features=ncols,
                                             random_state=SEED,
                                             n_informative=informative)
    split = dsel.train_test_split(features,
                                  targets,
                                  random_state=SEED,
                                  train_size=nrows)
    return split
示例#12
0
def test_split_df_single_argument(test_size, train_size, shuffle):
    """Splitting a lone DataFrame yields parts of the requested sizes."""
    frame = cudf.DataFrame({'x': range(50)})
    train_part, test_part = train_test_split(frame,
                                             train_size=train_size,
                                             test_size=test_size,
                                             shuffle=shuffle,
                                             random_state=0)
    total_rows = frame.shape[0]

    if train_size is not None:
        assert train_part.shape[0] == int(total_rows * train_size)

    if test_size is not None:
        assert test_part.shape[0] == int(total_rows * test_size)
示例#13
0
def load_data(fpath):
    """
    Read a parquet file with cuDF and split it into train and test parts.

    The "ArrDelayBinary" column is used as the label (cast to int32); all
    remaining columns are the features.

    :param fpath: Path to the parquet data to be ingested
    :return: The result of ``train_test_split`` on the features and labels,
        called with ``test_size=0.2``
    """
    import cudf

    label_column = "ArrDelayBinary"
    frame = cudf.read_parquet(fpath)
    features = frame.drop([label_column], axis=1)
    labels = frame[label_column].astype("int32")

    return train_test_split(features, labels, test_size=0.2)
示例#14
0
def test_stratified_random_seed(seed_type):
    """Stratified splits must be reproducible per seed and shuffled.

    For each of ten random seeds the data is split twice with equivalent
    seeds; both splits must be identical, and the training part must not
    simply be the original rows in their original order.
    """

    def monotonic_inc(x):
        # True when consecutive rows always differ by exactly 1.
        dx = cp.diff(x.values, axis=0)
        return cp.all(dx == 1)

    # The split and its assertions belong inside the loop; previously only
    # the seed from the final iteration was ever exercised.
    for _ in range(10):
        seed_n = np.random.randint(0, int(1e9))
        if seed_type == 'int':
            seed = seed_n
        if seed_type == 'cupy':
            seed = cp.random.RandomState(seed=seed_n)
        if seed_type == 'numpy':
            seed = np.random.RandomState(seed=seed_n)
        X = cudf.DataFrame({"x": range(100)})
        y = cudf.Series(([0] * (100 // 2)) + ([1] * (100 // 2)))

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=seed, stratify=y)

        # RandomState objects are rebuilt from the same integer so the
        # second split starts from the same generator state.
        if seed_type == 'cupy':
            seed = cp.random.RandomState(seed=seed_n)
        if seed_type == 'numpy':
            seed = np.random.RandomState(seed=seed_n)

        X_train2, X_test2, y_train2, y_test2 = \
            train_test_split(X, y, random_state=seed, stratify=y)

        assert X_train.equals(X_train2)
        assert X_test.equals(X_test2)
        assert y_train.equals(y_train2)
        assert y_test.equals(y_test2)

        # Ensure that data is shuffled
        assert not (
            X.head().index.values == X_train.head().index.values).all()

        assert not monotonic_inc(X_train)
示例#15
0
    def _preprocess_data(self, train_data, labels, batch_size, train_size,
                         truncate):
        """Split the labelled domains and wrap each part in a DataLoader.

        Returns ``(train_dataloader, test_dataloader)``.
        """
        frame = cudf.DataFrame()
        frame["domain"] = train_data
        frame["type"] = labels
        domain_train, domain_test, type_train, type_test = train_test_split(
            frame, "type", train_size=train_size)

        # The test part is processed before the train part at every stage.
        test_df = self._create_df(domain_test, type_test)
        train_df = self._create_df(domain_train, type_train)

        datasets = [DGADataset(part, truncate)
                    for part in (test_df, train_df)]
        test_loader, train_loader = [
            DataLoader(dataset, batchsize=batch_size) for dataset in datasets
        ]
        return train_loader, test_loader
示例#16
0
def test_stratify_retain_index(test_size, train_size):
    """A stratified, shuffled split must keep each row's original index."""
    n_rows = 10
    X = cudf.DataFrame({"x": range(n_rows)})
    y = cudf.Series(([0] * (n_rows // 2)) + ([1] * (n_rows // 2)))

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_size,
        test_size=test_size,
        shuffle=True,
        stratify=True)

    # Column "x" was built equal to the row index, so equality after the
    # split means the index travelled with the rows.
    for part in (X_train, X_test):
        assert (part["x"] == part.index).all()

    if train_size is not None:
        assert X_train.shape[0] == int(X.shape[0] * train_size)
    elif test_size is not None:
        assert X_test.shape[0] == int(X.shape[0] * test_size)
示例#17
0
    def split_dataset(self, dataset, random_state):
        """
        Divide ``dataset`` into train and test subsets on the configured
        label column and cast all four parts to the configured dtype.
        The caller-supplied ``random_state`` seeds the split (currently
        the CV-fold index). Plan to refactor with sklearn KFold.
        """

        hpo_log.info('> train-test split')

        X_train, X_test, y_train, y_test = train_test_split(
            dataset,
            self.hpo_config.label_column,
            random_state=random_state)

        target_dtype = self.hpo_config.dataset_dtype
        return tuple(part.astype(target_dtype)
                     for part in (X_train, X_test, y_train, y_test))
示例#18
0
def test_default_values():
    """Without size arguments the split is 75% train / 25% test."""
    row_ids = np.arange(100).reshape(100, 1)
    X = cp.asarray(np.zeros((100, 10)) + row_ids)
    y = cp.asarray(np.arange(100).reshape(100, 1))

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    for part in (X_train, X_test, y_train, y_test):
        assert isinstance(part, cp.ndarray)

    expectations = ((X_train, X, 0.75), (y_train, y, 0.75),
                    (X_test, X, 0.25), (y_test, y, 0.25))
    for part, whole, fraction in expectations:
        assert part.shape[0] == whole.shape[0] * fraction
示例#19
0
def load_data(fpath):
    """
    Read a parquet file with cuDF and split it into train and test parts.

    Paths starting with 'gs://' are opened through ``gcsfs``; any other
    path is handed to ``cudf.read_parquet`` directly. The "ArrDelayBinary"
    column is used as the label (cast to int32); all remaining columns are
    the features.

    :param fpath: Path to the parquet data to be ingested
    :return: The result of ``train_test_split`` on the features and labels,
        called with ``test_size=0.2``
    """
    import cudf

    if not fpath.startswith('gs://'):
        frame = cudf.read_parquet(fpath)
    else:
        gcs = gcsfs.GCSFileSystem()
        with gcs.open(fpath, mode='rb') as handle:
            frame = cudf.read_parquet(handle)

    label_column = "ArrDelayBinary"
    features = frame.drop([label_column], axis=1)
    labels = frame[label_column].astype("int32")

    return train_test_split(features, labels, test_size=0.2)
示例#20
0
def test_stratify_any_input(test_size, train_size):
    """Stratifying on a feature column must keep each row's original index."""
    X = cudf.DataFrame({"x": range(10)})
    X['test_col'] = cudf.Series([10, 0, 0, 10, 10, 10, 0, 0, 10, 10])
    y = cudf.Series(([0] * (10 // 2)) + ([1] * (10 // 2)))

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_size,
        test_size=test_size,
        shuffle=True,
        stratify=X['test_col'],
        random_state=15)

    # Column "x" was built equal to the row index, so equality after the
    # split means the index travelled with the rows.
    for part in (X_train, X_test):
        assert (part["x"].to_numpy() == part.index.to_numpy()).all()

    if train_size is not None:
        assert X_train.shape[0] == int(X.shape[0] * train_size)
    elif test_size is not None:
        assert X_test.shape[0] == int(X.shape[0] * test_size)
示例#21
0
def test_split_dataframe(train_size, shuffle):
    """Split a DataFrame/Series pair and verify sizes, indexes and contents.

    The train and test parts must have the requested sizes, features and
    labels must share their index, and concatenating the parts must give
    back the original values.
    """
    X = cudf.DataFrame({"x": range(100)})
    y = cudf.Series(([0] * (100 // 2)) + ([1] * (100 // 2)))

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=train_size,
                                                        shuffle=shuffle)
    assert len(X_train) == len(y_train) == pytest.approx(train_size * len(X))
    assert (len(X_test) == len(y_test) == pytest.approx(
        (1 - train_size) * len(X)))
    assert (all(X_train.index.to_pandas() == y_train.index.to_pandas()))
    assert (all(X_test.index.to_pandas() == y_test.index.to_pandas()))

    X_reconstructed = cudf.concat([X_train, X_test]).sort_values(by=["x"])
    y_reconstructed = cudf.concat([y_train, y_test]).sort_values()
    assert len(X_reconstructed) == len(X)
    assert len(y_reconstructed) == len(y)

    # `all(frame == X)` would iterate column labels and always pass;
    # compare the column values themselves.
    x_out = (X_reconstructed["x"].to_pandas().values
             == X["x"].to_pandas().values)
    assert x_out.all()
    out = y_reconstructed.reset_index(drop=True).values_host == y.values_host
    assert all(out)
def test_train_model():
    """Train on synthetic e-mails and check the resulting model class.

    Nothing is checked when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return

    fake = Faker()
    sample_count = 200
    email_col = [fake.text() for _ in range(sample_count)]
    label_col = [random.randint(0, 1) for _ in range(sample_count)]
    rows = list(zip(email_col, label_col))
    emails_gdf = cudf.DataFrame(rows, columns=["email", "label"])

    X_train, X_test, y_train, y_test = train_test_split(
        emails_gdf, "label", train_size=0.8, random_state=10
    )
    sc.train_model(
        X_train["email"],
        y_train,
        learning_rate=3e-5,
        max_seq_len=128,
        batch_size=6,
        epochs=1,
    )

    expected_cls = (
        transformers.models.bert.modeling_bert.BertForSequenceClassification
    )
    assert isinstance(sc._model.module, expected_cls)
示例#23
0
def test_stratified_split(type, test_size, train_size):
    """Class proportions of the training labels must match the full labels."""
    # A large sample keeps the proportion estimates stable.
    X, y = make_classification(n_samples=10000)

    if type == 'cupy':
        X, y = cp.asarray(X), cp.asarray(y)

    if type == 'numba':
        X, y = cuda.to_device(X), cuda.to_device(y)

    def counts(labels):
        # Fraction of samples per class, rounded to two decimals.
        _, inverse = cp.unique(labels, return_inverse=True)
        per_class = cp.bincount(inverse)
        total = cp.sum(per_class)
        return [
            cp.around(float(count) / total.item(), decimals=2).item()
            for count in per_class
        ]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_size,
        test_size=test_size,
        stratify=y)

    original_counts = counts(y)
    split_counts = counts(y_train)
    assert cp.isclose(original_counts, split_counts, equal_nan=False,
                      rtol=0.1).all()

    if type == 'cupy':
        for part in (X_train, X_test):
            assert isinstance(part, cp.ndarray)

    if type == 'numba':
        for part in (X_train, X_test):
            assert cuda.devicearray.is_cuda_ndarray(part)
示例#24
0
def test_hinge_loss(nrows, ncols, n_info, input_type, n_classes):
    """The cuML hinge loss must agree with scikit-learn's on the same scores."""
    train_rows = np.int32(nrows * 0.8)
    X, y = make_classification(n_samples=nrows,
                               n_features=ncols,
                               n_clusters_per_class=1,
                               n_informative=n_info,
                               random_state=123,
                               n_classes=n_classes)

    if input_type == "cudf":
        X, y = cudf.DataFrame(X), cudf.Series(y)
    elif input_type == "cupy":
        X, y = cp.asarray(X), cp.asarray(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_rows,
        shuffle=True)

    model = cu_log()
    model.fit(X_train, y_train)
    decision = model.decision_function(X_test)
    cu_loss = cuml_hinge(y_test, decision.T, labels=cp.unique(y))

    # Move labels and decision scores to host memory for scikit-learn.
    if input_type == "cudf":
        y_test = y_test.to_array()
        y = y.to_array()
        decision = cp.asnumpy(decision.values)
    elif input_type == "cupy":
        y = cp.asnumpy(y)
        y_test = cp.asnumpy(y_test)
        decision = cp.asnumpy(decision)

    reference_loss = sk_hinge(y_test, decision.T, labels=np.unique(y))
    # Both losses are computed from the same decision scores.
    cp.testing.assert_array_almost_equal(cu_loss, reference_loss)
示例#25
0
import joblib
import os
import cuml
from cuml.benchmark.datagen import load_higgs


# Directory that receives the artifacts written by this script.
OUTPUT_DIR = './outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

X, y = load_higgs()
# Only the first N_ROWS rows are used for the train/test split.
N_ROWS = 1000000
run = Run.get_context()
client = ExplanationClient.from_run(run)
run.log('N_ROWS', N_ROWS)
X_train, X_test, y_train, y_test = train_test_split(X[:N_ROWS],
                                                    y[:N_ROWS],
                                                    random_state=1)
# write x_test out as a pickle file for later visualization
x_test_pkl = 'x_test.pkl'
# joblib.dump is given the target path and writes there itself. The
# previous `open(x_test_pkl, 'wb')` only created an unused empty file in
# the working directory.
joblib.dump(value=X_test, filename=os.path.join(OUTPUT_DIR, x_test_pkl))
run.upload_file('x_test_higgs.pkl', os.path.join(OUTPUT_DIR, x_test_pkl))


gamma = 0.001
C = 100.
# Use SVC algorithm to create a model
reg = cuml.svm.SVC(C=C, gamma=gamma, probability=True)
model = reg.fit(X_train, y_train)

# preds = reg.predict(X_test)
示例#26
0
    def train_model(self,
                    train_gdf,
                    cat_cols,
                    cont_cols,
                    label_col,
                    batch_size,
                    epochs,
                    lr=0.01,
                    wd=0.0):
        """
        This function is used for training fastai tabular model with a given training dataset.

        The input is split 90/10 into training and validation parts, a
        TabularModel is built and stored on ``self._model``, and the model
        is trained for ``epochs`` passes, printing the training loss after
        each pass.

        NOTE(review): ``lr`` and ``wd`` are not referenced anywhere in this
        method; ``self._config_optimizer()`` is called without arguments.
        Confirm whether they should be forwarded.

        :param train_gdf: training dataset with categorized and/or continuous feature columns
        :type train_gdf: cudf.DataFrame
        :param cat_cols: array of categorical column names in train_gdf
        :type cat_cols: array
        :param cont_cols: array of continuous column names in train_gdf
        :type cont_cols: array
        :param label_col: column name of label column in train_gdf
        :type label_col: str
        :param batch_size: train_gdf will be partitioned into multiple dataframes of this size
        :type batch_size: int
        :param epochs: number of epochs to be adjusted depending on convergence for a specific dataset
        :type epochs: int
        :param lr: learning rate
        :type lr: float
        :param wd: presumably weight decay (TODO confirm)
        :type wd: float

        Examples
        --------
        >>> from clx.analytics.asset_classification import AssetClassification
        >>> ac = AssetClassification()
        >>> cat_cols = ["1", "2", "3", "4", "5", "6", "7", "8", "9"]
        >>> cont_cols = ["10"]
        >>> ac.train_model(X_train, cat_cols, cont_cols, "label", batch_size, epochs, lr=0.01, wd=0.0)
        """

        self._cat_cols = cat_cols
        self._cont_cols = cont_cols

        # train/test split
        X, val_X, Y, val_Y = train_test_split(train_gdf,
                                              label_col,
                                              train_size=0.9)
        # Give features the same index as their labels so the label column
        # can be assigned back onto them below.
        val_X.index = val_Y.index
        X.index = Y.index

        # Map each categorical feature column to a category count derived
        # from its largest value in the training part.
        embedded_cols = {}
        for col in cat_cols:
            if col != label_col:
                # NOTE(review): the "+ 2" offset is not explained in this
                # method - confirm the intended headroom.
                categories_cnt = X[col].max() + 2
                if categories_cnt > 1:
                    embedded_cols[col] = categories_cnt

        # Re-attach the labels to both parts.
        X[label_col] = Y
        val_X[label_col] = val_Y

        # Embedding
        # One (n_categories, width) pair per column; width is half the
        # category count rounded up, capped at 100.
        embedding_sizes = [(n_categories, min(100, (n_categories + 1) // 2))
                           for _, n_categories in embedded_cols.items()]

        n_cont = len(cont_cols)
        # Output size is the number of distinct labels in the full input.
        out_sz = train_gdf[label_col].nunique()

        # Partition dataframes
        train_part_dfs = self._get_partitioned_dfs(X, batch_size)
        val_part_dfs = self._get_partitioned_dfs(val_X, batch_size)

        self._model = TabularModel(embedding_sizes, n_cont, out_sz,
                                   self._layers, self._drops, self._emb_drop,
                                   self._is_reg, self._is_multi, self._use_bn)
        self._to_device(self._model, self._device)
        self._config_optimizer()
        # One training pass followed by one validation pass per epoch.
        for i in range(epochs):
            loss = self._train(self._model, self._optimizer, train_part_dfs,
                               cat_cols, cont_cols, label_col)
            print("training loss: ", loss)
            self._val_loss(self._model, val_part_dfs, cat_cols, cont_cols,
                           label_col)
示例#27
0
def classification_dataset(request):
    """Return a seeded train/test split of a 10-sample, 5-feature dataset."""
    features, labels = make_classification(n_samples=10,
                                           n_features=5,
                                           random_state=0)
    split = train_test_split(features, labels, random_state=0)
    return split
示例#28
0
def test_split_invalid_proportion(train_size):
    """The supplied train_size must be rejected with ValueError."""
    frame = cudf.DataFrame({'x': range(10)})
    labels = cudf.Series([0] * 10)

    with pytest.raises(ValueError):
        train_test_split(frame, labels, train_size=train_size)
示例#29
0
def test_split_size_mismatch():
    """Features and labels of different lengths must raise ValueError."""
    three_rows = cudf.DataFrame({'x': range(3)})
    two_labels = cudf.Series([0, 1])

    with pytest.raises(ValueError):
        train_test_split(three_rows, two_labels)
示例#30
0
def test_qn(loss, dtype, penalty, l1_strength, l2_strength, fit_intercept):
    """Exercise the ``cuQN`` solver for one loss/penalty configuration.

    For ``loss == 'softmax'`` a 4-class dataset is generated, split with
    stratification, and the fitted model's accuracy is required to beat a
    "always predict the most frequent class" baseline and to reach 0.50.

    For ``loss == 'sigmoid'`` the model is fitted on ``precomputed_X`` /
    ``precomputed_y_log`` (defined outside this function) and ``qn.objective``
    and ``qn.coef_`` are compared with hard-coded reference values chosen by
    ``penalty``, ``l1_strength``, ``l2_strength`` and ``fit_intercept``.

    Parameters
    ----------
    loss : str
        ``'softmax'`` or ``'sigmoid'``; any other value only constructs the
        solver and checks nothing.
    dtype
        Element type used when generating / converting the input data.
    penalty : str
        ``'none'``, ``'l1'``, ``'l2'`` or ``'elasticnet'``. It is NOT passed
        to ``cuQN``; it only drives the skip condition and the selection of
        reference values below.
    l1_strength, l2_strength : float
        Regularization strengths forwarded to ``cuQN``.
    fit_intercept : bool
        Forwarded to ``cuQN``; also selects which reference values apply.
    """

    # A "none" penalty combined with a non-zero strength is not a meaningful
    # combination, so that parameter set is skipped instead of tested.
    if penalty == "none" and (l1_strength > 0 or l2_strength > 0):
        pytest.skip("`none` penalty does not take l1/l2_strength")

    # Tolerance for the objective comparisons below; the solver itself is
    # configured with a tighter tolerance (1e-8).
    tol = 1e-6

    qn = cuQN(loss=loss,
              fit_intercept=fit_intercept,
              l1_strength=l1_strength,
              l2_strength=l2_strength,
              tol=1e-8,
              output_type="cupy")

    if loss == 'softmax':
        X, y = make_classification(n_samples=5000,
                                   n_informative=10,
                                   n_features=20,
                                   n_classes=4,
                                   dtype=dtype)

        # Stratify on the labels so the class proportions are kept in both
        # partitions of the split.
        stratify = y.astype(dtype)
        X_train, X_test, y_train, y_test = train_test_split(X.astype(dtype),
                                                            y.astype(dtype),
                                                            stratify=stratify)
        # Most frequent label, computed over the full label set (not only the
        # training part).
        # NOTE(review): indexing unique(y) with the bincount argmax presumably
        # relies on labels being contiguous integers starting at 0 - confirm.
        most_class = cp.unique(y)[cp.argmax(cp.bincount(y))]

        # Baseline: predict the most frequent class for every test sample.
        baseline_preds = cp.array([most_class] * y_test.shape[0], dtype=dtype)
        baseline_score = accuracy_score(y_test, baseline_preds)

        y_pred = qn.fit(X_train, y_train).predict(X_test)
        cuml_score = accuracy_score(y_test, y_pred)

        assert (cuml_score > baseline_score)
        assert (cuml_score >= 0.50)

    elif loss == 'sigmoid':
        # precomputed_X / precomputed_y_log come from outside this function.
        X = np.array(precomputed_X, dtype=dtype)
        y = np.array(precomputed_y_log, dtype=dtype)
        qn.fit(X, y)
        print(qn.objective)
        print(qn.coef_)

        # NOTE(review): every objective check below has the form
        # `(objective - reference) < tol` without abs(), so an objective
        # smaller than the reference always passes - confirm this one-sided
        # comparison is intended.

        # No regularization at all.
        if penalty == 'none' and l1_strength == 0.0 and l2_strength == 0.0:
            if fit_intercept:
                assert (qn.objective - 0.40263831615448) < tol
                cp.testing.assert_array_almost_equal(qn.coef_,
                                                     np.array([[-2.1088872],
                                                               [2.4812558]]),
                                                     decimal=3)
            else:
                assert (qn.objective - 0.4317452311515808) < tol
                cp.testing.assert_array_almost_equal(qn.coef_,
                                                     np.array([[-2.120777],
                                                               [3.056865]]),
                                                     decimal=3)
        # Pure L1: only checked when the L2 strength is zero.
        elif penalty == 'l1' and l2_strength == 0.0:
            if fit_intercept:
                if l1_strength == 0.0:
                    # Zero strength: same references as the unregularized case.
                    assert (qn.objective - 0.40263831615448) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-2.1088872],
                                                              [2.4812558]]),
                                                         decimal=3)
                else:
                    assert (qn.objective - 0.44295936822891235) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-1.6899368],
                                                              [1.9021575]]),
                                                         decimal=3)

            else:
                if l1_strength == 0.0:
                    assert (qn.objective - 0.4317452311515808) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array([[-2.120777],
                                                                   [3.056865]
                                                                   ]),
                                                         decimal=3)

                else:
                    assert (qn.objective - 0.4769895672798157) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-1.6214856],
                                                              [2.3650239]]),
                                                         decimal=3)

                # assert False

        # Pure L2: only checked when the L1 strength is zero.
        elif penalty == 'l2' and l1_strength == 0.0:
            if fit_intercept:
                if l2_strength == 0.0:
                    # Zero strength: same references as the unregularized case.
                    assert (qn.objective - 0.40263831615448) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-2.1088872],
                                                              [2.4812558]]),
                                                         decimal=3)
                else:
                    assert (qn.objective - 0.43780848383903503) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-1.5337948],
                                                              [1.678699]]),
                                                         decimal=3)

            else:
                if l2_strength == 0.0:
                    assert (qn.objective - 0.4317452311515808) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array([[-2.120777],
                                                                   [3.056865]
                                                                   ]),
                                                         decimal=3)

                else:
                    assert (qn.objective - 0.4750209450721741) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-1.3931049],
                                                              [2.0140104]]),
                                                         decimal=3)

        # Elastic net is a separate `if` (not part of the chain above) and
        # covers all four zero / non-zero combinations of the two strengths.
        if penalty == 'elasticnet':
            if fit_intercept:
                if l1_strength == 0.0 and l2_strength == 0.0:
                    assert (qn.objective - 0.40263831615448) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-2.1088872],
                                                              [2.4812558]]),
                                                         decimal=3)
                elif l1_strength == 0.0:
                    # Only L2 active: same references as the 'l2' branch.
                    assert (qn.objective - 0.43780848383903503) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-1.5337948],
                                                              [1.678699]]),
                                                         decimal=3)
                elif l2_strength == 0.0:
                    # Only L1 active: same references as the 'l1' branch.
                    assert (qn.objective - 0.44295936822891235) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-1.6899368],
                                                              [1.9021575]]),
                                                         decimal=3)
                else:
                    # Both strengths non-zero.
                    assert (qn.objective - 0.467987984418869) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-1.3727235],
                                                              [1.4639963]]),
                                                         decimal=3)
            else:
                if l1_strength == 0.0 and l2_strength == 0.0:
                    assert (qn.objective - 0.4317452311515808) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array([[-2.120777],
                                                                   [3.056865]
                                                                   ]),
                                                         decimal=3)
                elif l1_strength == 0.0:
                    assert (qn.objective - 0.4750209450721741) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-1.3931049],
                                                              [2.0140104]]),
                                                         decimal=3)

                elif l2_strength == 0.0:
                    assert (qn.objective - 0.4769895672798157) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-1.6214856],
                                                              [2.3650239]]),
                                                         decimal=3)
                else:
                    # Both strengths non-zero.
                    assert (qn.objective - 0.5067970156669617) < tol
                    cp.testing.assert_array_almost_equal(qn.coef_,
                                                         np.array(
                                                             [[-1.2102532],
                                                              [1.752459]]),
                                                         decimal=3)

                # Emits an empty line; only reached for elasticnet without
                # an intercept.
                print()