def test_save_load():
    """Check that a saved-and-reloaded encoder encodes data identically.

    An encoder is fitted on a small data set and saved to disk. A second,
    empty encoder is then populated from that file, and both must produce
    equal output for the same records.

    The file is written inside a temporary directory that is removed when
    the ``with`` block exits, so the test leaves nothing behind on disk.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    encoder = OneHotEncoder(['animal', 'color'], ['weight'], max_levels_default=100)
    data = [
        {'animal': 'cat', 'color': 'blue', 'weight': 6.0},
        {'animal': 'cat', 'color': 'red', 'weight': 3.0},
        {'animal': 'dog', 'color': 'yellow', 'weight': 5.5},
        {'animal': 'fish', 'color': 'blue', 'weight': 7.0},
        {'animal': 'cat', 'color': 'magenta', 'weight': 2.0},
        {'animal': 'mouse', 'color': 'purple', 'weight': 0.0},
        {'animal': 'mouse', 'color': 'black', 'weight': 99.9},
    ]
    encoder.load_from_data_stream(data)
    encoded_data = encoder.encode_data(data)

    # NamedTemporaryFile().name only borrows a path from a file object that
    # is discarded immediately, and whatever save() writes there afterwards
    # is never cleaned up. A private temporary directory avoids both issues.
    with tempfile.TemporaryDirectory() as tmp_dir:
        filename = os.path.join(tmp_dir, 'encoder')
        encoder.save(filename)

        # Start from an encoder with no configured columns so that any
        # state it ends up with was restored from the file.
        encoder_from_file = OneHotEncoder([], [])
        encoder_from_file.load_from_file(filename)
        encoded_data_from_file = encoder_from_file.encode_data(data)

    assert encoded_data == encoded_data_from_file
def test_inversion_more_complicated_with_max_levels():
    """Round-trip records through an encoder built with per-column limits.

    The encoder is constructed with ``{'animal': 2, 'color': 2}`` for the
    categorical columns. After encoding and decoding, some categorical
    values are expected to come back as ``'UNKNOWN_CATEGORICAL_LEVEL'``,
    while the numeric columns are expected to be unchanged.
    """
    unknown = 'UNKNOWN_CATEGORICAL_LEVEL'
    columns = ('animal', 'color', 'weight', 'height')

    input_rows = [
        ('cat', 'blue', 6.0, 88.9),
        ('cat', 'red', 3.0, 44.9),
        ('dog', 'yellow', 5.5, 2.5),
        ('fish', 'blue', 7.0, 3233.2),
        ('cat', 'magenta', 2.0, 666.6),
        ('mouse', 'red', 0.0, 55.5),
        ('mouse', 'blah', 99.9, 33),
    ]
    expected_rows = [
        ('cat', 'blue', 6.0, 88.9),
        ('cat', 'red', 3.0, 44.9),
        (unknown, unknown, 5.5, 2.5),
        (unknown, 'blue', 7.0, 3233.2),
        ('cat', unknown, 2.0, 666.6),
        ('mouse', 'red', 0.0, 55.5),
        ('mouse', unknown, 99.9, 33),
    ]
    records = [dict(zip(columns, row)) for row in input_rows]
    expected = [dict(zip(columns, row)) for row in expected_rows]

    encoder = OneHotEncoder({'animal': 2, 'color': 2}, ['weight', 'height'])
    encoder.load_from_data_stream(records)

    round_tripped = encoder.decode_data(encoder.encode_data(records))
    assert round_tripped == expected
def run_example(stats=False):
    """Fit an encoder on sample records, verify the round trip, return its package.

    Args:
        stats: When truthy, ``encoder.add_numeric_stats`` is called with the
            sample records before the package is built.

    Returns:
        Whatever ``encoder.package_data()`` returns.

    Raises:
        AssertionError: If decoding the encoded records does not produce the
            expected list.
    """
    unknown = 'UNKNOWN_CATEGORICAL_LEVEL'
    columns = ('animal', 'color', 'weight', 'height')

    input_rows = [
        ('cat', 'blue', 6.0, 88.9),
        ('cat', 'red', 3.0, 44.9),
        ('dog', 'yellow', 5.5, 2.5),
        ('fish', 'blue', 7.0, 3233.2),
        ('cat', 'magenta', 2.0, 666.6),
        ('mouse', 'red', 0.0, 55.5),
        ('mouse', 'blah', 99.9, 33),
    ]
    records = [dict(zip(columns, row)) for row in input_rows]
    # The first record carries a key that is not among the configured columns.
    records[0]['extra_junk'] = 'blah'

    expected_rows = [
        ('cat', 'blue', 6.0, 88.9),
        ('cat', unknown, 3.0, 44.9),
        (unknown, unknown, 5.5, 2.5),
        (unknown, 'blue', 7.0, 3233.2),
        ('cat', unknown, 2.0, 666.6),
        ('mouse', unknown, 0.0, 55.5),
        ('mouse', unknown, 99.9, 33),
    ]
    expected = [dict(zip(columns, row)) for row in expected_rows]

    encoder = OneHotEncoder({'animal': 2, 'color': 1}, ['weight', 'height'])
    encoder.load_from_data_stream(records)

    round_tripped = encoder.decode_data(encoder.encode_data(records))
    assert round_tripped == expected

    # Numeric stats are optional and are added before packaging.
    if stats:
        encoder.add_numeric_stats(records)

    return encoder.package_data()
def test_inversion_more_complicated():
    """Check that encode/decode is lossless with two numeric columns.

    With ``max_levels_default=100``, decoding the encoded records must give
    back the original records, and re-encoding the decoded records must
    reproduce the first encoding.
    """
    columns = ('animal', 'color', 'weight', 'height')
    rows = [
        ('cat', 'blue', 6.0, 88.9),
        ('cat', 'red', 3.0, 44.9),
        ('dog', 'yellow', 5.5, 2.5),
        ('fish', 'blue', 7.0, 3233.2),
        ('cat', 'magenta', 2.0, 666.6),
        ('mouse', 'red', 0.0, 55.5),
        ('mouse', 'blah', 99.9, 33),
    ]
    records = [dict(zip(columns, row)) for row in rows]

    encoder = OneHotEncoder(
        ['animal', 'color'],
        ['weight', 'height'],
        max_levels_default=100,
    )
    encoder.load_from_data_stream(records)

    first_encoding = encoder.encode_data(records)
    restored = encoder.decode_data(first_encoding)
    assert restored == records

    second_encoding = encoder.encode_data(restored)
    assert second_encoding == first_encoding
def test_inversion():
    """Check that encode/decode is lossless with a single numeric column.

    Decoding the encoded records must give back the original records, and
    re-encoding the decoded records must reproduce the first encoding.
    """
    animals = ['cat', 'cat', 'dog', 'fish', 'cat', 'mouse', 'mouse']
    colors = ['blue', 'red', 'yellow', 'blue', 'magenta', 'purple', 'black']
    weights = [6.0, 3.0, 5.5, 7.0, 2.0, 0.0, 99.9]
    records = [
        {'animal': animal, 'color': color, 'weight': weight}
        for animal, color, weight in zip(animals, colors, weights)
    ]

    encoder = OneHotEncoder(['animal', 'color'], ['weight'], max_levels_default=100)
    encoder.load_from_data_stream(records)

    first_encoding = encoder.encode_data(records)
    restored = encoder.decode_data(first_encoding)
    assert restored == records

    second_encoding = encoder.encode_data(restored)
    assert second_encoding == first_encoding
def get_round_trip_decoded(stats=False, omit_cols=None):
    """Encode and decode sample records, returning the result and the package.

    Args:
        stats: When truthy, ``encoder.add_numeric_stats`` is called with the
            sample records before the package is built.
        omit_cols: Passed straight through to ``OneHotEncoder`` as its
            ``omit_cols`` keyword argument.

    Returns:
        A ``(data_decoded, packaged)`` tuple: the output of
        ``encoder.decode_data`` on the encoded records, and the output of
        ``encoder.package_data()``.
    """
    columns = ('animal', 'color', 'weight', 'height')
    rows = [
        ('cat', 'blue', 6.0, 88.9),
        ('cat', 'red', 3.0, 44.9),
        ('dog', 'yellow', 5.5, 2.5),
        ('fish', 'blue', 7.0, 3233.2),
        ('cat', 'magenta', 2.0, 666.6),
        ('mouse', 'red', 0.0, 55.5),
        ('mouse', 'blah', 99.9, 33),
    ]
    records = [dict(zip(columns, row)) for row in rows]
    # The first record carries a key that is not among the configured columns.
    records[0]['extra_junk'] = 'blah'

    encoder = OneHotEncoder(
        {'animal': 2, 'color': 1},
        ['weight', 'height'],
        omit_cols=omit_cols,
    )
    encoder.load_from_data_stream(records)

    # The round trip is done before any optional stats are added.
    round_tripped = encoder.decode_data(encoder.encode_data(records))

    if stats:
        encoder.add_numeric_stats(records)

    return round_tripped, encoder.package_data()