Example #1
    def test_dataframe_raises(self):
        df = pd.DataFrame({"A": ["a", "a", "b"]}, dtype="category")
        dpp.LabelEncoder().fit(df)  # OK

        df["other"] = ["a", "b", "c"]
        with pytest.raises(ValueError):
            dpp.LabelEncoder().fit(df)
Example #2
    def test_unseen_raises_array(self):
        enc = dpp.LabelEncoder().fit(y)
        new = da.from_array(np.array(["a", "a", "z"]), chunks=2)
        result = enc.transform(new)

        with pytest.raises(ValueError):
            result.compute()
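The `y` fixture is not shown in this snippet; a minimal stand-in (values and chunking are assumptions, not from the source) makes it reproducible. Note that `transform` is lazy, so the unseen label "z" only raises once `.compute()` materializes the result:

import numpy as np
import dask.array as da

# assumed fixture: labels seen at fit time include "a" and "b" but not "z"
y = da.from_array(np.array(["a", "a", "b", "b"]), chunks=2)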
Example #3
    def test_categorical(self, categories, transformed, daskify, ordered):
        cat = pd.Series(
            ["a", "b", "a"],
            dtype=pd.api.types.CategoricalDtype(categories=categories,
                                                ordered=ordered),
        )
        if daskify:
            cat = dd.from_pandas(cat, npartitions=2)
            transformed = da.from_array(transformed, chunks=(2, 1))
            if daskify == "unknown":
                cat = cat.cat.as_unknown()

        a = dpp.LabelEncoder().fit(cat)

        if daskify != "unknown":
            assert a.dtype_ == cat.dtype
        np.testing.assert_array_equal(a.classes_, categories)
        result = a.transform(cat)
        da.utils.assert_eq(result, transformed)

        inv_transformed = a.inverse_transform(result)
        if daskify:
            # manually set the divisions for the test
            inv_transformed.divisions = (0, 2)
        dd.utils.assert_eq(inv_transformed, cat)
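The parametrize decorators are not part of the snippet; one plausible parameter set consistent with the assertions above (illustrative only, not the project's actual values):

@pytest.mark.parametrize("ordered", [True, False])
@pytest.mark.parametrize("daskify", [False, True, "unknown"])
@pytest.mark.parametrize(
    "categories, transformed",
    [(["a", "b"], np.array([0, 1, 0]))],
)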
Example #4
def build_bow_model(training, testing):
    vectorizer = CountVectorizer()
    encoder = preprocessing.LabelEncoder()

    print("Converting to Dask Databags...")
    X_train_db = db.from_sequence(training['X_trn'], npartitions=NUMBER_OF_CPU)
    X_test_db = db.from_sequence(testing['X_tst'], npartitions=NUMBER_OF_CPU)

    print("Building BoW...")
    X_model = vectorizer.fit(X_train_db)
    X_train = X_model.transform(X_train_db)
    X_test = X_model.transform(X_test_db)

    print("Indexing strings...")
    y_model = encoder.fit(training['y_trn'])
    y_train = y_model.transform(training['y_trn'])
    y_test = y_model.transform(testing['y_tst'])

    print("Computing chunks...")
    compute_chunks(X_train, y_train, X_test, y_test)

    print("Re-convert to Dask Array")
    Xtrain, Xtest = convert_X_data(X_train, X_test)

    return Xtrain, y_train, Xtest, y_test
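A hypothetical invocation, assuming `training` and `testing` are dicts of plain Python lists keyed as in the function body (`compute_chunks` and `convert_X_data` are project helpers not shown here):

training = {'X_trn': ["a good movie", "a dull plot"], 'y_trn': ["pos", "neg"]}
testing = {'X_tst': ["a great movie"], 'y_tst': ["pos"]}
Xtrain, y_train, Xtest, y_test = build_bow_model(training, testing)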
Example #5
def main():
    # client = Client("tcp://127.0.0.1:64958")
    client = Client(processes=False, threads_per_worker=2, n_workers=1, memory_limit='4GB')
    print(client)

    rs = RandomSearcher(get_space_num_cat_pipeline_complex, optimize_direction=OptimizeDirection.Maximize)
    hk = HyperGBM(rs, task='classification', reward_metric='accuracy',
                  cache_dir=f'{test_output_dir}/hypergbm_cache',
                  callbacks=[SummaryCallback(), FileLoggingCallback(rs, output_dir=f'{test_output_dir}/hyn_logs')])

    df = dsutils.load_bank_by_dask()
    df = df.drop(['id'], axis=1)  # drop() is not in-place; the result must be assigned
    df['y'] = dm_pre.LabelEncoder().fit_transform(df['y'])
    # df = df.sample(frac=0.1)

    # object_columns = [i for i, v in df.dtypes.items() if v == 'object']
    # for c in object_columns:
    #     df[c] = df[c].astype('category')
    # df = df.categorize(object_columns)

    X_train, X_test = train_test_split(df, test_size=0.8, random_state=42)
    y_train = X_train.pop('y')
    y_test = X_test.pop('y')

    hk.search(X_train, y_train, X_test, y_test, max_trails=50)
    print('-' * 30)

    best_trial = hk.get_best_trail()
    print(f'best_trial:{best_trial}')
    estimator = hk.final_train(best_trial.space_sample, X_train, y_train)
    y_pred = estimator.predict(X_test)  # predictions, not a metric score
    result = estimator.evaluate(X_test, y_test, metrics=['accuracy', 'auc', 'logloss'])
    print(f'final result:{result}')
Example #6
    def compute_class_weight(class_weight, *, classes, y):
        if not DaskToolBox.is_dask_object(y):
            return sk_utils.class_weight.compute_class_weight(class_weight,
                                                              classes=classes,
                                                              y=y)

        y = DaskToolBox.make_chunk_size_known(y)
        if set(dask.compute(da.unique(y))[0]) - set(classes):
            raise ValueError(
                "classes should include all valid labels that can be in y")

        if class_weight == 'balanced':
            # Find the weight of each class as present in y.
            le = dm_pre.LabelEncoder()
            y_ind = le.fit_transform(y)
            # if not all(np.in1d(classes, le.classes_)):
            #     raise ValueError("classes should have valid labels that are in y")
            # recip_freq = len(y) / (len(le.classes_) *
            #                        np.bincount(y_ind).astype(np.float64))
            # weight = recip_freq[le.transform(classes)]
            y_shape, y_ind_bincount, le_classes_ = dask.compute(
                y.shape, da.bincount(y_ind), le.classes_)
            if not all(np.in1d(classes, le_classes_)):
                raise ValueError(
                    "classes should have valid labels that are in y")
            recip_freq = y_shape[0] / (len(le_classes_) *
                                       y_ind_bincount.astype(np.float64))
            weight = recip_freq[np.searchsorted(le_classes_, classes)]
        else:
            raise ValueError("Only class_weight == 'balanced' is supported.")

        return weight
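For reference, the 'balanced' branch above reduces to n_samples / (n_classes * bincount(y_ind)); a tiny NumPy check of the arithmetic:

import numpy as np

y_ind = np.array([0, 0, 0, 1])   # three samples of class 0, one of class 1
classes = np.array([0, 1])
recip_freq = len(y_ind) / (len(classes) * np.bincount(y_ind).astype(np.float64))
# recip_freq == [0.667, 2.0]: the rare class receives the larger weight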
Example #7
    def _fit_array(self, X, y=None):
        # fit one LabelEncoder per column of the 2-D array X
        n_features = X.shape[1]
        for n in range(n_features):
            le = dm_pre.LabelEncoder()
            le.fit(X[:, n])
            self.encoders[n] = le
        return self
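A plausible transform counterpart mirroring the per-column fit loop (hypothetical, not taken from the source; assumes X is an object-dtype array so integer codes can be assigned in place):

    def _transform_array(self, X):
        # apply the column-wise encoders fitted in _fit_array above
        X = X.copy()
        for n, le in self.encoders.items():
            X[:, n] = le.transform(X[:, n])
        return X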
Example #8
    def test_basic(self):
        a = dpp.LabelEncoder()
        b = spp.LabelEncoder()

        a.fit(y)
        b.fit(y.compute())
        assert_estimator_equal(a, b)
Example #9
    def test_basic(self):
        a = dpp.LabelEncoder()
        b = spp.LabelEncoder()

        a.fit(y)
        b.fit(y.compute())
        exclude = {"dtype_"}
        assert_estimator_equal(a, b, exclude=exclude)
Example #10
    def test_transform(self, array):
        a = dpp.LabelEncoder()
        b = spp.LabelEncoder()

        a.fit(array)
        b.fit(array.compute())

        assert_eq_ar(a.transform(array).compute(), b.transform(array.compute()))
Example #11
    def test_input_types(self, dask_array, pandas_series):
        a = dpp.LabelEncoder()
        b = spp.LabelEncoder()

        assert_estimator_equal(a.fit(dask_array), b.fit(pandas_series))

        assert_estimator_equal(a.fit(pandas_series), b.fit(pandas_series))

        assert_estimator_equal(a.fit(pandas_series.values),
                               b.fit(pandas_series))

        assert_estimator_equal(a.fit(dask_array), b.fit(pandas_series.values))
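Neither fixture is shown; one consistent pairing (values assumed for illustration):

import numpy as np
import pandas as pd
import dask.array as da

pandas_series = pd.Series(["a", "b", "a", "c"])
dask_array = da.from_array(pandas_series.values, chunks=2)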
Example #12
def to_parquet(sales_series, file_name, processed_dir, LOG):
    LOG.debug('Setting index')
    sales_series = sales_series.set_index(sales_series['id'])
    LOG.debug('Setting index - done')
    encoders = {}
    # TODO: dask supposedly does this on its own with sensible defaults
    # sales_series['parquet_partition'] = np.random.randint(0, 100, sales_series.shape[0])

    # this one is a dup of day_date_str which is harder to squeeze through the rest of the pipeline (yay petastorm)
    if 'day_date' in sales_series.columns:
        LOG.debug(f"Dropping 'day_date' from {sales_series.columns}")
        sales_series = sales_series.drop(['day_date'], axis=1)

    for col in sales_series.columns:
        if col in encoders:
            LOG.debug(f'Skipping: {col} - already encoded')
            continue

        # petastorm can't read these
        if str(sales_series[col].dtype) == 'uint8':
            sales_series[col] = sales_series[col].astype('int')

        if str(sales_series[col].dtype) in ['category', 'object']:
            LOG.debug(f'Encoding: {col}')
            enc = dask_preprocessing.LabelEncoder()
            #enc = LabelEncoder()
            sales_series[col] = enc.fit_transform(sales_series[col])
            # TODO: update other transforms too!
            encoders[col] = enc

    for name, enc in encoders.items():
        LOG.debug(f"Saving encoder: {name}")
        np.save(f'{processed_dir}/{name}.npy', enc.classes_)

    # TODO: uint -> int, category/object -> int, day_date -> drop
    # TODO: this is being called both on dask and pandas data frames and args are rather not compatible :/
    parquet_file = f'{processed_dir}/{file_name}'
    LOG.debug(f"Saving {type(sales_series)} to {parquet_file}")
    kwargs = {}
    is_pandas_df = isinstance(sales_series, pd.DataFrame)
    # pandas `to_parquet` takes `index=`; dask takes `write_index=`
    index_kwarg_name = 'index' if is_pandas_df else 'write_index'
    kwargs[index_kwarg_name] = False

    sales_series.to_parquet(
        parquet_file,
        **kwargs
#        partition_cols=['parquet_partition']
    )
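Because only `classes_` is persisted, each encoder can later be rebuilt without refitting; a sketch of the matching loader (file layout as above; a plain scikit-learn LabelEncoder suffices for lookups at load time):

import numpy as np
from sklearn.preprocessing import LabelEncoder

def load_encoder(processed_dir, name):
    le = LabelEncoder()
    le.classes_ = np.load(f'{processed_dir}/{name}.npy', allow_pickle=True)
    return le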
Example #13
    def test_use_categorical(self, daskify):
        data = pd.Series(["b", "c"],
                         dtype=pd.api.types.CategoricalDtype(["c", "a", "b"]))
        if daskify:
            data = dd.from_pandas(data, npartitions=2)
        a = dpp.LabelEncoder(use_categorical=False).fit(data)
        b = spp.LabelEncoder().fit(data)
        assert_estimator_equal(a, b, exclude={"dtype_"})
        assert a.dtype_ is None

        a_trn = a.transform(data)
        b_trn = b.transform(data)
        da.utils.assert_eq(a_trn, b_trn)
        da.utils.assert_eq(a.inverse_transform(a_trn),
                           b.inverse_transform(b_trn))
Example #14
def transform(data):
    # keep one fitted encoder per feature; overwriting a single `encoder`
    # variable would return only the last column's encoder
    encoders = {}
    for feature in cat_features:
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])
        encoders[feature] = encoder
    return encoders, data
Example #15
    def test_fit_transform_categorical(self):
        cat = dd.from_pandas(pd.Series(choices, dtype="category"),
                             npartitions=4)
        result = dpp.LabelEncoder().fit_transform(cat)
        # categoricals are encoded via their codes, so the narrow code
        # dtype (int8 for this few categories) round-trips through dask
        assert result.dtype == "int8"
        assert result.dtype == result.compute().dtype
Example #16
    def test_transform_dtypes(self, array):
        result = dpp.LabelEncoder().fit_transform(array)
        assert result.dtype == np.intp
        if dask.is_dask_collection(array):
            assert result.dtype == result.compute().dtype
Example #17
import pandas as pd
import dask.array as da
import dask.dataframe as dd
import xgboost as xgb
from dask.distributed import Client, LocalCluster
from dask_ml import preprocessing  # dask-ml's LabelEncoder handles dask collections


cluster = LocalCluster(n_workers=16, threads_per_worker=1)
client = Client(cluster)

d_train = pd.read_csv("https://s3.amazonaws.com/benchm-ml--main/train-1m.csv")
d_test = pd.read_csv("https://s3.amazonaws.com/benchm-ml--main/test.csv")
d_all = pd.concat([d_train,d_test])

dx_all = dd.from_pandas(d_all, npartitions=16)

vars_cat = ["Month","DayofMonth","DayOfWeek","UniqueCarrier", "Origin", "Dest"]
vars_num = ["DepTime","Distance"]
for col in vars_cat:
  dx_all[col] = preprocessing.LabelEncoder().fit_transform(dx_all[col])
  
X_all = dx_all[vars_cat+vars_num].to_dask_array(lengths=True)      
y_all = da.where((dx_all["dep_delayed_15min"]=="Y").to_dask_array(lengths=True),1,0)  

X_train = X_all[0:d_train.shape[0],]
y_train = y_all[0:d_train.shape[0]]
X_test = X_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0]),]
y_test = y_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])]

X_train = X_train.persist()  # persist() returns a new collection; keep the reference
y_train = y_train.persist()

client.has_what()
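
The snippet imports xgboost but stops before training; a minimal continuation on the same client (hyperparameters are illustrative):

dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
output = xgb.dask.train(
    client,
    {"objective": "binary:logistic", "max_depth": 10, "eta": 0.1},
    dtrain,
    num_boost_round=100,
)
booster = output["booster"]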

Example #18
    def test_inverse_transform(self, array):
        a = dpp.LabelEncoder()
        assert_eq_ar(a.inverse_transform(a.fit_transform(array)),
                     da.asarray(array))
Example #19
def transform(data):
    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
           'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    # as in Example #14, keep one fitted encoder per feature
    encoders = {}
    for feature in cat:
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])
        encoders[feature] = encoder
    return encoders, data