def test_to_array_method():
    """Check ``to_array`` on one-column and two-column matrices.

    The one-column matrix must yield an array equal to its values; the
    two-column matrix must raise ``Autom8Exception`` with a message
    matching ``Expected.*one column``.
    """
    single = autom8.create_matrix([[1], [2], [3], [4]])
    double = autom8.create_matrix([[1, 2], [3, 4], [5, 6]])

    expected = np.array([1, 2, 3, 4])
    assert np.array_equal(single.to_array(), expected)

    with pytest.raises(autom8.Autom8Exception) as excinfo:
        double.to_array()
    excinfo.match('Expected.*one column')
def test_columns_with_numbers_as_strings():
    """Numeric strings (plain, ``$``-prefixed, ``%``-suffixed) become numbers.

    Expects ``clean_dataset`` to record two steps with no warnings, and
    expects ``playback`` of those steps to convert a second matrix the
    same way.
    """
    training_rows = [
        ['A', 'B', 'C'],
        ['1.1', '$4', 7],
        ['2.2', '$5', 8],
        ['3.3', '6%', 9],
    ]
    receiver = autom8.Accumulator()
    source = autom8.create_matrix(
        _add_labels(training_rows), receiver=receiver)
    ctx = autom8.create_context(source, receiver=receiver)
    autom8.clean_dataset(ctx)

    assert len(receiver.warnings) == 0
    assert len(ctx.steps) == 2
    cleaned = ctx.matrix.tolist()
    assert cleaned == [[1.1, 4, 7], [2.2, 5, 8], [3.3, 6, 9]]

    # Replay the recorded steps against rows the context has not seen.
    replay_rows = [['A', 'B', 'C'], [1, '2%', 'foo'], ['3', 4.0, 'bar']]
    replay_source = autom8.create_matrix(replay_rows, receiver=receiver)
    replay = PlaybackContext(replay_source, receiver=receiver)
    playback(ctx.steps, replay)

    assert replay.matrix.tolist() == [[1, 2, 'foo'], [3, 4, 'bar']]
    replayed_columns = replay.matrix.columns
    assert replayed_columns[0].dtype == int
    assert replayed_columns[1].dtype == float
def test_column_of_ints_and_floats():
    """Columns mixing ints, floats and ``None`` are cleaned to floats.

    Expects four recorded steps and two warnings. Each input column is
    expected to come out as a float column followed by a boolean column
    that is ``False`` on the rows where the input held ``None``. The same
    result is expected when the steps are played back on new rows.
    """
    training_rows = [
        ['A', 'B'],
        [1, 3.3],
        [2.2, 4],
        [None, None],
    ]
    receiver = autom8.Accumulator()
    source = autom8.create_matrix(
        _add_labels(training_rows), receiver=receiver)
    ctx = autom8.create_context(source, receiver=receiver)
    autom8.clean_dataset(ctx)

    assert len(ctx.steps) == 4
    assert len(receiver.warnings) == 2
    expected_cleaned = [
        [1.0, True, 3.3, True],
        [2.2, True, 4.0, True],
        [0.0, False, 0.0, False],
    ]
    assert ctx.matrix.tolist() == expected_cleaned

    # Replay the recorded steps against rows the context has not seen.
    replay_rows = [['A', 'B'], [None, 10], [20.0, None], [30, 40]]
    replay_source = autom8.create_matrix(replay_rows, receiver=receiver)
    replay = PlaybackContext(replay_source, receiver=receiver)
    playback(ctx.steps, replay)

    expected_replayed = [
        [0.0, False, 10.0, True],
        [20.0, True, 0.0, False],
        [30.0, True, 40.0, True],
    ]
    assert replay.matrix.tolist() == expected_replayed
    replayed_columns = replay.matrix.columns
    assert replayed_columns[0].dtype == float
    assert replayed_columns[2].dtype == float
def test_tolist_method():
    """``tolist`` must return the rows that the matrix was built from."""
    labelled_rows = [['hi', True], ['bye', False]]
    labelled = autom8.create_matrix(
        dataset=[['hi', True], ['bye', False]],
        column_names=['msg', 'flag'],
        column_roles=['textual', 'encoded'],
    )
    numeric_rows = [[1, 2.0], [3, 4.0], [5, 6.0]]
    numeric = autom8.create_matrix([[1, 2.0], [3, 4.0], [5, 6.0]])

    assert labelled.tolist() == labelled_rows
    assert numeric.tolist() == numeric_rows
def test_len_method():
    """``len(matrix)`` must equal the number of rows supplied."""
    three_rows = autom8.create_matrix([
        ['hi', 1, True],
        ['so', 2, True],
        ['bye', 3, False],
    ])
    seven_rows = autom8.create_matrix(
        [[1], [2], [3], [4], [5], [6], [7]],
    )

    assert len(three_rows) == 3
    assert len(seven_rows) == 7
def test_drop_columns_by_index():
    """``drop_columns_by_index`` must leave only the columns not listed."""
    three_wide = autom8.create_matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    four_wide = autom8.create_matrix(
        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
    )

    three_wide.drop_columns_by_index([0, 2])
    four_wide.drop_columns_by_index([1, 2])

    assert len(three_wide.columns) == 1
    assert len(four_wide.columns) == 2

    survivor = three_wide.columns[0].values
    assert np.array_equal(survivor, np.array([2, 5, 8]))
    assert three_wide.tolist() == [[2], [5], [8]]
    assert four_wide.tolist() == [[1, 4], [5, 8], [9, 12]]
def test_mixed_up_columns_with_strings_and_numbers():
    """Columns mixing strings, numbers and booleans are split in two.

    Expects six recorded steps, no warnings, and a matrix whose formulas
    are a ``number`` and a ``string`` entry for each of ``A`` and ``B``.
    The same formulas are expected after playing the steps back.
    """
    expected_formulas = [
        ['number', 'A'],
        ['string', 'A'],
        ['number', 'B'],
        ['string', 'B'],
    ]

    training_rows = [
        ['A', 'B'],
        [True, 'foo'],
        [1.1, 30],
        [20, 4.4],
        ['bar', False],
        ['', 'baz'],
        [50, 'fiz'],
        [None, True],
    ]
    receiver = autom8.Accumulator()
    source = autom8.create_matrix(
        _add_labels(training_rows), receiver=receiver)
    ctx = autom8.create_context(source, receiver=receiver)
    autom8.clean_dataset(ctx)

    assert len(ctx.steps) == 6
    assert len(receiver.warnings) == 0
    expected_cleaned = [
        [1.0, '', 0.0, 'foo'],
        [1.1, '', 30.0, ''],
        [20.0, '', 4.4, ''],
        [0.0, 'bar', 0.0, ''],
        [0.0, '', 0.0, 'baz'],
        [50.0, '', 0.0, 'fiz'],
        [0.0, '', 1.0, ''],
    ]
    assert ctx.matrix.tolist() == expected_cleaned
    assert ctx.matrix.formulas == expected_formulas

    # Replay the recorded steps against rows the context has not seen.
    replay_rows = [['A', 'B'], [False, 'buz'], ['zim', 10], [2, None]]
    replay_source = autom8.create_matrix(replay_rows, receiver=receiver)
    replay = PlaybackContext(replay_source, receiver=receiver)
    playback(ctx.steps, replay)

    expected_replayed = [
        [0.0, '', 0.0, 'buz'],
        [0.0, 'zim', 10.0, ''],
        [2.0, '', 0.0, ''],
    ]
    assert replay.matrix.tolist() == expected_replayed
    assert replay.matrix.formulas == expected_formulas
def test_columns_with_some_empty_strings():
    """Empty strings in otherwise typed columns are treated as missing.

    Expects six recorded steps, three warnings, and each column paired
    with an ``is-defined`` companion in ``formulas``. The same formulas
    are expected after playing the steps back on new rows.
    """
    expected_formulas = [
        'A', ['is-defined', 'A'],
        'B', ['is-defined', 'B'],
        'C', ['is-defined', 'C'],
    ]

    training_rows = [
        ['A', 'B', 'C'],
        [True, 1.1, 20],
        ['', 2.2, 30],
        [False, '', 40],
        [False, 3.3, ''],
        ['', 4.4, ''],
    ]
    receiver = autom8.Accumulator()
    source = autom8.create_matrix(
        _add_labels(training_rows), receiver=receiver)
    ctx = autom8.create_context(source, receiver=receiver)
    autom8.clean_dataset(ctx)

    assert len(ctx.steps) == 6
    assert len(receiver.warnings) == 3
    expected_cleaned = [
        [True, True, 1.1, True, 20, True],
        [False, False, 2.2, True, 30, True],
        [False, True, 0.0, False, 40, True],
        [False, True, 3.3, True, 0, False],
        [False, False, 4.4, True, 0, False],
    ]
    assert ctx.matrix.tolist() == expected_cleaned
    assert ctx.matrix.formulas == expected_formulas

    # Replay the recorded steps against rows the context has not seen.
    replay_rows = [['A', 'B', 'C'], ['', 5.5, ''], [True, '', 50]]
    replay_source = autom8.create_matrix(replay_rows, receiver=receiver)
    replay = PlaybackContext(replay_source, receiver=receiver)
    playback(ctx.steps, replay)

    expected_replayed = [
        [False, False, 5.5, True, 0, False],
        [True, True, 0.0, False, 50, True],
    ]
    assert replay.matrix.tolist() == expected_replayed
    assert replay.matrix.formulas == expected_formulas
def test_creating_simple_matrix_with_names_and_roles():
    """``create_matrix`` must honour explicit column names and roles.

    Also checks that both columns are flagged as original and that no
    warnings reach the receiver.
    """
    receiver = autom8.Accumulator()
    matrix = autom8.create_matrix(
        dataset=[['hi', True], ['bye', False]],
        column_names=['msg', 'flag'],
        column_roles=['textual', 'encoded'],
        receiver=receiver,
    )
    msg_col, flag_col = matrix.columns

    expected_msg = np.array(['hi', 'bye'], dtype=object)
    expected_flag = np.array([True, False], dtype=None)
    assert np.array_equal(msg_col.values, expected_msg)
    assert np.array_equal(flag_col.values, expected_flag)

    # Metadata checks, grouped per column.
    assert msg_col.name == 'msg'
    assert msg_col.role == 'textual'
    assert msg_col.is_original

    assert flag_col.name == 'flag'
    assert flag_col.role == 'encoded'
    assert flag_col.is_original

    assert len(receiver.warnings) == 0
def test_column_with_some_blank_strings():
    """Whitespace-only strings are handled like empty strings.

    Mirrors ``test_columns_with_some_empty_strings``, but most of the
    empty strings are replaced with blank (whitespace-only) strings. The
    cleaned matrix and formulas are expected to be the same as there.
    """
    training_rows = [
        ['A', 'B', 'C'],
        [True, 1.1, 20],
        [' ', 2.2, 30],
        [False, '\t', 40],
        [False, 3.3, ' \t \r \n\t'],
        ['', 4.4, ' '],
    ]
    receiver = autom8.Accumulator()
    source = autom8.create_matrix(
        _add_labels(training_rows), receiver=receiver)
    ctx = autom8.create_context(source, receiver=receiver)
    autom8.clean_dataset(ctx)

    expected_cleaned = [
        [True, True, 1.1, True, 20, True],
        [False, False, 2.2, True, 30, True],
        [False, True, 0.0, False, 40, True],
        [False, True, 3.3, True, 0, False],
        [False, False, 4.4, True, 0, False],
    ]
    expected_formulas = [
        'A', ['is-defined', 'A'],
        'B', ['is-defined', 'B'],
        'C', ['is-defined', 'C'],
    ]
    assert ctx.matrix.tolist() == expected_cleaned
    assert ctx.matrix.formulas == expected_formulas
def test_creating_simple_matrix_from_list():
    """A plain list of rows yields columns named ``A``, ``B``, ``C``.

    Also checks that the columns carry the supplied values, have no role,
    are flagged as original, and that no warnings reach the receiver.
    """
    receiver = autom8.Accumulator()
    matrix = autom8.create_matrix(
        [['hi', 1, True], ['bye', 2, False]],
        receiver=receiver,
    )
    first, second, third = matrix.columns
    columns = (first, second, third)

    expected_values = (
        np.array(['hi', 'bye'], dtype=object),
        np.array([1, 2], dtype=None),
        np.array([True, False], dtype=None),
    )
    for column, expected in zip(columns, expected_values):
        assert np.array_equal(column.values, expected)

    for column, expected_name in zip(columns, ('A', 'B', 'C')):
        assert column.name == expected_name

    for column in columns:
        assert column.role is None

    for column in columns:
        assert column.is_original

    assert len(receiver.warnings) == 0
def test_is_recording_property():
    """``is_recording`` is true for ``create_context``, false for playback.

    Both kinds of context must also expose a ``receiver`` attribute.
    """
    matrix = autom8.create_matrix([[1, 2]])
    recording = autom8.create_context(matrix)
    replaying = PlaybackContext(matrix, autom8.Accumulator())

    assert recording.is_recording
    assert not replaying.is_recording

    for context in (recording, replaying):
        assert hasattr(context, 'receiver')
def test_append_column():
    """``append_column`` adds a second, non-original column.

    The new column must carry the given name, role and values.
    """
    matrix = autom8.create_matrix([[1], [2], [3], [4]])
    matrix.append_column(np.array([2, 4, 6, 8]), 'foo', 'encoded')

    # Unpacking also verifies that the matrix now has exactly two columns.
    _, appended = matrix.columns

    assert appended.name == 'foo'
    assert appended.role == 'encoded'
    assert not appended.is_original

    appended_values = np.array([2, 4, 6, 8])
    first_column_values = np.array([1, 2, 3, 4])
    assert np.array_equal(appended.values, appended_values)
    assert not np.array_equal(appended.values, first_column_values)
def test_select_columns_by_name_with_superset():
    """Selecting ``['C', 'B']`` returns those columns in the order asked."""
    rows = [
        ['hi', 1, True, 10.5],
        ['so', 2, True, 15.5],
        ['bye', 3, False, 20.5],
    ]
    matrix = autom8.create_matrix(rows, column_names=['A', 'B', 'C', 'D'])

    selection = matrix.select_columns_by_name(['C', 'B'])

    expected_rows = [[True, 1], [True, 2], [False, 3]]
    assert selection.tolist() == expected_rows
def test_columns_with_numbers_with_commas():
    """Numeric strings containing thousands separators become numbers.

    Expects a single recorded step and no warnings.
    """
    training_rows = [['A'], ['1,100.0'], ['2,200'], ['3,300'], ['50']]
    receiver = autom8.Accumulator()
    source = autom8.create_matrix(
        _add_labels(training_rows), receiver=receiver)
    ctx = autom8.create_context(source, receiver=receiver)
    autom8.clean_dataset(ctx)

    assert len(receiver.warnings) == 0
    assert len(ctx.steps) == 1
    cleaned = ctx.matrix.tolist()
    assert cleaned == [[1100], [2200], [3300], [50]]
def test_duplicate_column_names():
    """Repeated column names must produce exactly one warning."""
    receiver = autom8.Accumulator()
    autom8.create_matrix(
        dataset=[[1, 2, 3]],
        column_names=['A', 'B', 'A'],
        receiver=receiver,
    )

    warnings = receiver.warnings
    assert len(warnings) == 1
    assert 'Column names are not unique' in warnings[0]
def test_select_columns_by_name_with_invalid_arguments():
    """Selecting an unknown column name must raise ``Autom8Exception``.

    The exception message must match ``Expected column names``.
    """
    rows = [
        ['hi', 1, True, 10.5],
        ['so', 2, True, 15.5],
        ['bye', 3, False, 20.5],
    ]
    matrix = autom8.create_matrix(rows, column_names=['A', 'B', 'C', 'D'])

    # 'Z' is not among the matrix's column names.
    requested = ['C', 'Z']
    with pytest.raises(autom8.Autom8Exception) as excinfo:
        matrix.select_columns_by_name(requested)
    excinfo.match('Expected column names')
def test_planner_decorator():
    """``drop_duplicate_columns`` accepts a recording context only.

    A context from ``create_context`` must be accepted, while a
    ``PlaybackContext`` must raise ``Autom8Exception`` with a message
    matching ``Expected.*RecordingContext``.
    """
    matrix = autom8.create_matrix([[1, 1], [2, 2]])
    recording = autom8.create_context(matrix)
    replaying = PlaybackContext(matrix, autom8.Accumulator())

    # Must complete without raising.
    autom8.drop_duplicate_columns(recording)

    # Must be rejected.
    with pytest.raises(autom8.Autom8Exception) as excinfo:
        autom8.drop_duplicate_columns(replaying)
    excinfo.match('Expected.*RecordingContext')
def test_column_dtype_property():
    """``Column.dtype`` must report the numpy dtype for each kind of value.

    Strings and ``None`` are expected as object dtype; ints, floats and
    booleans as ``int64``, ``float64`` and ``bool`` respectively.
    """
    matrix = autom8.create_matrix([
        ['hi', 10, 1.1, True, None],
        ['so', 20, 2.2, True, None],
        ['bye', 30, 3.3, False, None],
    ])
    # Unpacking also verifies that there are exactly five columns.
    text, integer, real, flag, empty = matrix.columns

    expectations = (
        (text, 'O'),
        (integer, 'int64'),
        (real, 'float64'),
        (flag, 'bool'),
        (empty, 'O'),
    )
    for column, dtype_name in expectations:
        assert column.dtype == np.dtype(dtype_name)
def test_primitives_with_object_dtype():
    """Object-dtype columns of primitives are narrowed to primitive dtypes.

    After ``clean_dataset`` the columns are expected to be bool, float and
    int. Playing the recorded steps back on other inputs is expected to
    give the same dtypes.
    """
    def assert_bool_float_int(matrix):
        # The first three columns must be bool, float and int, in order.
        kinds = [column.dtype for column in matrix.columns]
        assert kinds[0] == bool
        assert kinds[1] == float
        assert kinds[2] == int

    training_rows = [
        ['A', 'B', 'C'],
        [True, 1.1, 2],
        [False, 3.1, 4],
        [True, 5.1, 6],
    ]
    receiver = autom8.Accumulator()
    source = autom8.create_matrix(
        _add_labels(training_rows), receiver=receiver)

    # Force every column to object dtype before cleaning.
    for column in source.columns:
        column.values = column.values.astype(object)

    ctx = autom8.create_context(source, receiver=receiver)
    autom8.clean_dataset(ctx)
    assert_bool_float_int(ctx.matrix)

    # First replay: values that can be coerced to the recorded dtypes.
    replay_rows = [
        ['A', 'B', 'C'],
        [1, 2, 3.0],
        [0, 4, 5.0],
        [1, False, 6.9],
    ]
    replay_source = autom8.create_matrix(replay_rows, receiver=receiver)
    replay = PlaybackContext(replay_source, receiver=receiver)
    playback(ctx.steps, replay)

    expected_replayed = [[True, 2.0, 3], [False, 4.0, 5], [True, 0.0, 6]]
    assert replay.matrix.tolist() == expected_replayed
    assert_bool_float_int(replay.matrix)

    # Second replay: strings, None and an empty tuple.
    odd_rows = [['A', 'B', 'C'], ['1', '2', None], ['', None, ()]]
    odd_source = autom8.create_matrix(odd_rows, receiver=receiver)
    odd_replay = PlaybackContext(odd_source, receiver=receiver)
    playback(ctx.steps, odd_replay)

    # Compare through repr so that the nan entry does not need special
    # handling (nan does not compare equal to itself).
    rendered = repr(odd_replay.matrix.tolist())
    assert rendered == "[[True, 2.0, 0], [False, nan, 0]]"
def test_column_with_all_none():
    """A column holding only ``None`` is dropped with a warning.

    Playing the recorded steps back is expected to drop the same column
    from a new matrix, even though it holds values there.
    """
    training_rows = [
        ['A', 'B', 'C'],
        [True, None, 2],
        [False, None, 4],
        [True, None, 6],
    ]
    receiver = autom8.Accumulator()
    source = autom8.create_matrix(
        _add_labels(training_rows), receiver=receiver)
    ctx = autom8.create_context(source, receiver=receiver)
    autom8.clean_dataset(ctx)

    warnings = receiver.warnings
    assert len(warnings) == 1
    assert 'Dropping column' in warnings[0]
    assert ctx.matrix.tolist() == [[True, 2], [False, 4], [True, 6]]

    # Replay the recorded steps against rows the context has not seen.
    replay_rows = [['A', 'B', 'C'], [1, 2, 'foo'], [3, 4, 'bar']]
    replay_source = autom8.create_matrix(replay_rows, receiver=receiver)
    replay = PlaybackContext(replay_source, receiver=receiver)
    playback(ctx.steps, replay)

    assert replay.matrix.tolist() == [[1, 'foo'], [3, 'bar']]
def test_matrix_with_unexpected_value():
    """A column of unsupported values is dropped with a warning.

    Column ``C`` holds a tuple, a dict and a bare ``object()``. Expects
    one warning mentioning the drop, and expects playback to drop the
    same column from a new matrix.
    """
    training_rows = [
        ['A', 'B', 'C'],
        [1, 2, ()],
        [3, 4, {}],
        [5, 6, object()],
    ]
    receiver = autom8.Accumulator()
    source = autom8.create_matrix(
        _add_labels(training_rows), receiver=receiver)
    ctx = autom8.create_context(source, receiver=receiver)
    autom8.clean_dataset(ctx)

    warnings = receiver.warnings
    assert len(warnings) == 1
    assert 'Dropping column' in warnings[0]
    assert 'contain booleans, numbers' in warnings[0]
    assert ctx.matrix.tolist() == [[1, 2], [3, 4], [5, 6]]

    # Replay the recorded steps against rows the context has not seen.
    replay_rows = [['A', 'B', 'C'], [1, 2, 'foo'], [3, 4, 'bar']]
    replay_source = autom8.create_matrix(replay_rows, receiver=receiver)
    replay = PlaybackContext(replay_source, receiver=receiver)
    playback(ctx.steps, replay)

    assert replay.matrix.tolist() == [[1, 2], [3, 4]]
def test_creating_a_matrix_with_list_of_roles():
    """A list passed as ``column_roles`` is applied to columns by position."""
    rows = [
        ['hi', 1, True, 10.5],
        ['so', 2, True, 15.5],
        ['bye', 3, False, 20.5],
    ]
    roles = ('textual', 'categorical', 'encoded', 'numerical')

    # Hand create_matrix its own list, as the role names are reused below.
    matrix = autom8.create_matrix(rows, column_roles=list(roles))

    for index, expected_role in enumerate(roles):
        assert matrix.columns[index].role == expected_role
def test_copy_method():
    """``copy`` returns a new matrix with new, equal columns.

    Neither the matrix, its column objects nor their value arrays may be
    shared with the original, while names, roles, ``is_original`` flags
    and values must all match.
    """
    mixed = autom8.create_matrix([
        ['hi', 1.1, True],
        ['so', 2.2, True],
        ['bye', 3.3, False],
    ])
    floats = autom8.create_matrix(
        [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]],
    )

    mixed_copy = mixed.copy()
    floats_copy = floats.copy()

    assert mixed is not mixed_copy
    assert floats is not floats_copy
    assert len(mixed.columns) == len(mixed_copy.columns)
    assert len(floats.columns) == len(floats_copy.columns)

    originals = mixed.columns + floats.columns
    duplicates = mixed_copy.columns + floats_copy.columns
    for original, duplicate in zip(originals, duplicates):
        # Distinct objects...
        assert original is not duplicate
        assert original.values is not duplicate.values
        # ...with equal contents.
        assert original.name == duplicate.name
        assert original.role == duplicate.role
        assert original.is_original == duplicate.is_original
        assert np.array_equal(original.values, duplicate.values)
def test_column_of_all_strings_and_none_values():
    """``None`` in a string column is replaced with an empty string.

    Expects one recorded step and no warnings. On playback the first
    column is expected to get the same replacement, while the ``None`` in
    the second column is expected to stay as it is.
    """
    training_rows = [
        ['A', 'B'],
        ['1', 2],
        ['foo', 4],
        [None, 0],
    ]
    receiver = autom8.Accumulator()
    source = autom8.create_matrix(
        _add_labels(training_rows), receiver=receiver)
    ctx = autom8.create_context(source, receiver=receiver)
    autom8.clean_dataset(ctx)

    assert len(receiver.warnings) == 0
    assert len(ctx.steps) == 1
    cleaned = ctx.matrix.tolist()
    assert cleaned == [['1', 2], ['foo', 4], ['', 0]]

    # Replay the recorded steps against rows the context has not seen.
    replay_rows = [['A', 'B'], [None, 'bar'], ['baz', None]]
    replay_source = autom8.create_matrix(replay_rows, receiver=receiver)
    replay = PlaybackContext(replay_source, receiver=receiver)
    playback(ctx.steps, replay)

    assert replay.matrix.tolist() == [['', 'bar'], ['baz', None]]
def test_extra_columns_warning_message():
    """Ragged datasets lose their extra columns and emit one warning.

    Checks both the number of columns kept and the exact warning text,
    including singular/plural wording ("1 extra column" versus
    "3 extra columns", "first 2 columns" versus "first 1 column").
    """
    a1 = autom8.Accumulator()
    a2 = autom8.Accumulator()
    m1 = autom8.create_matrix([[1, 2], [1, 2, 3]], receiver=a1)
    m2 = autom8.create_matrix(
        [[1], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4]],
        receiver=a2,
    )

    # Compare with ``==``. The form ``assert len(x), 2`` treats ``2`` as
    # the failure message and only checks that the length is non-zero.
    assert len(m1.columns) == 2
    assert a1.warnings == [
        'Dropped 1 extra column from dataset.'
        ' Keeping first 2 columns.'
        ' To avoid this behavior, ensure that each row in the dataset has'
        ' the same number of columns.'
    ]

    assert len(m2.columns) == 1
    assert a2.warnings == [
        'Dropped 3 extra columns from dataset.'
        ' Keeping first 1 column.'
        ' To avoid this behavior, ensure that each row in the dataset has'
        ' the same number of columns.'
    ]
def test_creating_simple_matrix_from_numpy_array():
    """A 2-D numpy array is split into one matrix column per array column."""
    receiver = autom8.Accumulator()
    grid = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
    matrix = autom8.create_matrix(grid, receiver=receiver)

    # Unpacking also verifies that there are exactly three columns.
    first, second, third = matrix.columns

    expectations = (
        (first, np.array([1, 4, 7, 10], dtype=object)),
        (second, np.array([2, 5, 8, 11], dtype=None)),
        (third, np.array([3, 6, 9, 12], dtype=None)),
    )
    for column, expected in expectations:
        assert np.array_equal(column.values, expected)
def test_creating_a_matrix_with_map_of_roles():
    """A dict passed as ``column_roles`` may be keyed by name or by index.

    An unknown column name and a key of an unsupported type must each
    raise ``Autom8Exception``.
    """
    rows = [
        ['hi', 1, True, 10.5],
        ['so', 2, True, 15.5],
        ['bye', 3, False, 20.5],
    ]
    names = ('A', 'B', 'C', 'D')

    # Keys mix column names with one integer index (1, i.e. column 'B').
    matrix = autom8.create_matrix(
        rows,
        column_names=list(names),
        column_roles={
            'D': 'numerical',
            'C': 'encoded',
            1: 'categorical',
            'A': 'textual',
        },
    )
    expected_roles = ('textual', 'categorical', 'encoded', 'numerical')
    for index, expected_role in enumerate(expected_roles):
        assert matrix.columns[index].role == expected_role

    # A name that matches no column.
    with pytest.raises(autom8.Autom8Exception) as excinfo:
        autom8.create_matrix(
            rows,
            column_names=list(names),
            column_roles={'Z': 'numerical'},
        )
    excinfo.match('Expected column')

    # A key that is neither a column name nor an index.
    with pytest.raises(autom8.Autom8Exception) as excinfo:
        autom8.create_matrix(
            rows,
            column_names=list(names),
            column_roles={object(): 'numerical'},
        )
    excinfo.match('Expected valid column')
def test_empty_dataset_warning_message():
    """Empty rows are dropped and the warning states how many.

    Covers the singular form ("1 empty row") and the plural form.
    """
    cases = (
        ([[]], ['Dropped 1 empty row from dataset.']),
        ([[], []], ['Dropped 2 empty rows from dataset.']),
        ([[], [], []], ['Dropped 3 empty rows from dataset.']),
    )

    # Create all receivers, then all matrices, then check the warnings.
    receivers = [autom8.Accumulator() for _ in cases]
    for (rows, _), receiver in zip(cases, receivers):
        autom8.create_matrix(rows, receiver=receiver)

    for (_, expected_warnings), receiver in zip(cases, receivers):
        assert receiver.warnings == expected_warnings
def test_column_of_all_strings():
    """A column made up entirely of strings is left untouched.

    Expects no recorded steps, no warnings, and an unchanged matrix, even
    though some of the strings look like numbers.
    """
    training_rows = [
        ['A', 'B'],
        ['1', 2],
        ['3', 4],
        ['n', 0],
    ]
    receiver = autom8.Accumulator()
    source = autom8.create_matrix(
        _add_labels(training_rows), receiver=receiver)
    ctx = autom8.create_context(source, receiver=receiver)
    autom8.clean_dataset(ctx)

    assert len(receiver.warnings) == 0
    assert len(ctx.steps) == 0
    unchanged = [['1', 2], ['3', 4], ['n', 0]]
    assert ctx.matrix.tolist() == unchanged