def test_load_from_data_encodes_data():
    """Smoke test: fit on a small stream, then encode every row of it.

    Only the coarse shape is checked here: one encoded row per input row,
    and an encoded row whose length differs from the raw row's field count.
    """
    encoder = OneHotEncoder(['animal', 'color'], ['weight'], max_levels_default=100)

    # Fixture written as a table; each record becomes a dict keyed by `fields`.
    fields = ('animal', 'color', 'weight')
    records = (
        ('cat', 'blue', 1.0),
        ('cat', 'red', 3.0),
        ('dog', 'yellow', 5.5),
        ('fish', 'blue', 7.0),
        ('cat', 'blue', 2.0),
        ('cat', 'blue', 0.0),
        ('cat', 'blue', 99.9),
    )
    data = [dict(zip(fields, record)) for record in records]

    encoder.load_from_data_stream(data)

    encoded_data = []
    for row in data:
        encoded_data.append(encoder.encode_row(row))

    raw_width = len(data[0])
    encoded_width = len(encoded_data[0])

    assert len(encoded_data) == len(data)
    assert encoded_width != raw_width
def test_load_from_data_encodes_data_correctly():
    """Check the exact encoded vectors produced after fitting on a small stream.

    The expectations below pin this column layout:

        [weight,
         animal=cat, animal=mouse, animal=dog, animal=fish,
         color=blue, color=black, color=magenta, color=purple, color=red,
         color=yellow]

    i.e. the numeric feature comes first, followed by one indicator column
    per categorical level. Within a categorical feature the levels are
    expected in descending frequency order, with ties broken alphabetically
    (cat x3, mouse x2, dog/fish x1; blue x2, all other colors x1).
    """
    encoder = OneHotEncoder(['animal', 'color'], ['weight'], max_levels_default=100)
    data = [
        {'animal': 'cat', 'color': 'blue', 'weight': 6.0},
        {'animal': 'cat', 'color': 'red', 'weight': 3.0},
        {'animal': 'dog', 'color': 'yellow', 'weight': 5.5},
        {'animal': 'fish', 'color': 'blue', 'weight': 7.0},
        {'animal': 'cat', 'color': 'magenta', 'weight': 2.0},
        {'animal': 'mouse', 'color': 'purple', 'weight': 0.0},
        {'animal': 'mouse', 'color': 'black', 'weight': 99.9},
    ]
    encoder.load_from_data_stream(data)
    encoded_data = [encoder.encode_row(row) for row in data]

    assert len(encoded_data) == len(data)
    assert len(encoded_data[0]) != len(data[0])

    # Row 0: cat / blue / 6.0
    first_row = encoded_data[0]
    expected = [
        6.0,  # weight is numeric and comes first
        1.0,  # animal=cat (most common animal); this row is a cat
        0.0,  # animal=mouse (next most common)
        0.0,  # animal=dog (ties with fish on frequency, dog first alphabetically)
        0.0,  # animal=fish
        1.0,  # color=blue (most common color); this row is blue
        0.0,  # color=black (first alphabetically among the frequency-1 colors)
        0.0,  # color=magenta
        0.0,  # color=purple
        0.0,  # color=red
        0.0,  # color=yellow
    ]
    assert first_row == expected

    # Row 1: cat / red / 3.0
    second_row = encoded_data[1]
    expected = [
        3.0,  # weight
        1.0,  # animal=cat; this row is a cat
        0.0,  # animal=mouse
        0.0,  # animal=dog
        0.0,  # animal=fish
        0.0,  # color=blue; this row is not blue
        0.0,  # color=black
        0.0,  # color=magenta
        0.0,  # color=purple
        1.0,  # color=red; this row is red
        0.0,  # color=yellow
    ]
    assert second_row == expected

    # Last row: mouse / black / 99.9
    last_row = encoded_data[-1]
    expected = [
        99.9,  # weight
        0.0,  # animal=cat; this row is not a cat
        1.0,  # animal=mouse; this row is a mouse
        0.0,  # animal=dog
        0.0,  # animal=fish
        0.0,  # color=blue; this row is not blue
        1.0,  # color=black; this row is black
        0.0,  # color=magenta
        0.0,  # color=purple
        0.0,  # color=red
        0.0,  # color=yellow
    ]
    # This expectation was built but never checked in the original test.
    assert last_row == expected

    # Finally, pin the complete encoded output for every input row.
    expected_total = [
        [6.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
        [5.5, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
        [7.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
        [99.9, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
    ]
    assert encoded_data == expected_total