Пример #1
0
def test_load_from_data_encodes_data():
    """Smoke-test encoding after loading levels from a list of row dicts.

    Loads seven rows into a OneHotEncoder via ``load_from_data_stream`` and
    encodes each of them with ``encode_row``.  Only two coarse properties are
    checked: one encoded row is produced per input row, and the encoded row
    has a different length than the three-field input dict.
    """
    field_names = ('animal', 'color', 'weight')
    raw_values = [
        ('cat', 'blue', 1.0),
        ('cat', 'red', 3.0),
        ('dog', 'yellow', 5.5),
        ('fish', 'blue', 7.0),
        ('cat', 'blue', 2.0),
        ('cat', 'blue', 0.0),
        ('cat', 'blue', 99.9),
    ]
    # Build the same row dicts (same keys, same key order) from the table.
    data = [dict(zip(field_names, values)) for values in raw_values]

    one_hot = OneHotEncoder(['animal', 'color'], ['weight'], max_levels_default=100)
    one_hot.load_from_data_stream(data)

    encoded_rows = []
    for record in data:
        encoded_rows.append(one_hot.encode_row(record))

    # One output row per input row.
    assert len(encoded_rows) == len(data)
    # The encoded width must differ from the raw field count.
    assert len(encoded_rows[0]) != len(data[0])
Пример #2
0
def test_load_from_data_encodes_data_correctly():
    """Check the exact one-hot layout produced after loading from data.

    The expected vectors below pin this column order:

        index 0      weight (the numeric column comes first)
        index 1-4    animal: cat, mouse, dog, fish
        index 5-10   color:  blue, black, magenta, purple, red, yellow

    Within each categorical column the expected order is by descending
    frequency in ``data``, with ties broken alphabetically (cat x3, mouse x2,
    then dog/fish x1; blue x2, then the remaining colors x1 each).

    The first, second and last rows are checked individually with annotated
    expectations, then the whole encoded data set is compared at once.
    """
    encoder = OneHotEncoder(['animal', 'color'], ['weight'], max_levels_default=100)

    data = [{'animal': 'cat', 'color': 'blue', 'weight': 6.0},
            {'animal': 'cat', 'color': 'red', 'weight': 3.0},
            {'animal': 'dog', 'color': 'yellow', 'weight': 5.5},
            {'animal': 'fish', 'color': 'blue', 'weight': 7.0},
            {'animal': 'cat', 'color': 'magenta', 'weight': 2.0},
            {'animal': 'mouse', 'color': 'purple', 'weight': 0.0},
            {'animal': 'mouse', 'color': 'black', 'weight': 99.9}]

    encoder.load_from_data_stream(data)

    encoded_data = [encoder.encode_row(row) for row in data]
    assert len(encoded_data) == len(data)
    assert len(encoded_data[0]) != len(data[0])

    # Row 0: cat / blue / 6.0
    first_row = encoded_data[0]
    expected_first = [6.0,  # weight is numeric and comes first
                      1.0,  # animal=cat (most common animal) -- this row is a cat
                      0.0,  # animal=mouse (next most common)
                      0.0,  # animal=dog (tied with fish, dog first alphabetically)
                      0.0,  # animal=fish
                      1.0,  # color=blue (most common color) -- this row is blue
                      0.0,  # color=black
                      0.0,  # color=magenta
                      0.0,  # color=purple
                      0.0,  # color=red
                      0.0]  # color=yellow
    assert first_row == expected_first

    # Row 1: cat / red / 3.0
    second_row = encoded_data[1]
    expected_second = [3.0,  # weight
                       1.0,  # animal=cat -- this row is a cat
                       0.0,  # animal=mouse
                       0.0,  # animal=dog
                       0.0,  # animal=fish
                       0.0,  # color=blue
                       0.0,  # color=black
                       0.0,  # color=magenta
                       0.0,  # color=purple
                       1.0,  # color=red -- this row is red
                       0.0]  # color=yellow
    assert second_row == expected_second

    # Row -1: mouse / black / 99.9
    last_row = encoded_data[-1]
    expected_last = [99.9,  # weight
                     0.0,  # animal=cat
                     1.0,  # animal=mouse -- this row is a mouse
                     0.0,  # animal=dog
                     0.0,  # animal=fish
                     0.0,  # color=blue
                     1.0,  # color=black -- this row is black
                     0.0,  # color=magenta
                     0.0,  # color=purple
                     0.0,  # color=red
                     0.0]  # color=yellow
    # This expectation was previously built but never compared to last_row.
    assert last_row == expected_last

    expected_total = [[6.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                      [3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
                      [5.5, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
                      [7.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                      [2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
                      [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
                      [99.9, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]]

    assert encoded_data == expected_total