예제 #1
0
import pandas as pd
from unittest import TestCase  # or `from unittest import ...` if on Python 3.4+
import tests.helpers as th
import numpy as np

import category_encoders as encoders

np_X = th.create_array(n_rows=100)
np_X_t = th.create_array(n_rows=50, extras=True)
np_y = np.random.randn(np_X.shape[0]) > 0.5
np_y_t = np.random.randn(np_X_t.shape[0]) > 0.5
X = th.create_dataset(n_rows=100)
X_t = th.create_dataset(n_rows=50, extras=True)
y = pd.DataFrame(np_y)
y_t = pd.DataFrame(np_y_t)


class TestWeightOfEvidenceEncoder(TestCase):
    def test_woe(self):
        cols = [
            'unique_str', 'underscore', 'extra', 'none', 'invariant', 321,
            'categorical', 'na_categorical', 'categorical_int'
        ]

        # balanced label with balanced features
        X_balanced = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'],
                                  columns=['col1'])
        y_balanced = [True, False, True, False, True, False]
        enc = encoders.WOEEncoder()
        enc.fit(X_balanced, y_balanced)
        X1 = enc.transform(X_balanced)
예제 #2
0
# loop times of benchmarking in every encoding, larger for more accurate but longer benchmarking time
benchmark_repeat = 3

# sample num of data
data_lines = 10000

# benchmarking result format
result_cols = [
    'encoder', 'used_processes', 'X_shape', 'min_time(s)', 'average_time(s)',
    'max_cpu_utilization(%)', 'average_cpu_utilization(%)'
]
results = []
cpu_utilization = multiprocessing.Manager().Queue()

# define data_set
np_X = th.create_array(n_rows=data_lines)
np_y = np.random.randn(np_X.shape[0]) > 0.5
X = th.create_dataset(n_rows=data_lines)
X_t = th.create_dataset(n_rows=int(data_lines / 2), extras=True)

cols = [
    'unique_str', 'underscore', 'extra', 'none', 'invariant', 321,
    'categorical', 'na_categorical'
]


def get_cpu_utilization():
    """
    new process for recording cpu utilization
    record cpu utilization every [cpu_sampling_rate] second & calculate its mean value
    the value is the cpu utilization during every encoding