def test_column_with_some_blank_strings():
    """Whitespace-only cells clean up exactly like empty-string cells.

    Same scenario as test_columns_with_some_empty_strings, except that most
    of the empty strings are replaced with blank (whitespace-only) strings.
    """
    rows = [
        ['A', 'B', 'C'],
        [True, 1.1, 20],
        [' ', 2.2, 30],
        [False, '\t', 40],
        [False, 3.3, ' \t \r \n\t'],
        ['', 4.4, ' '],
    ]
    receiver = autom8.Accumulator()
    labeled = autom8.create_matrix(_add_labels(rows), receiver=receiver)
    ctx = autom8.create_context(labeled, receiver=receiver)
    autom8.clean_dataset(ctx)

    # Each source column is followed by a boolean column that is False
    # wherever the source cell was empty or blank.
    expected_rows = [
        [True, True, 1.1, True, 20, True],
        [False, False, 2.2, True, 30, True],
        [False, True, 0.0, False, 40, True],
        [False, True, 3.3, True, 0, False],
        [False, False, 4.4, True, 0, False],
    ]
    expected_formulas = [
        'A', ['is-defined', 'A'],
        'B', ['is-defined', 'B'],
        'C', ['is-defined', 'C'],
    ]
    assert ctx.matrix.tolist() == expected_rows
    assert ctx.matrix.formulas == expected_formulas
def test_column_of_ints_and_floats():
    """Columns mixing ints, floats, and None come out as float columns.

    Each cleaned column is paired with a boolean column that is False where
    the input was None, and the recorded steps replay onto new data.
    """
    acc = autom8.Accumulator()
    training_rows = [
        ['A', 'B'],
        [1, 3.3],
        [2.2, 4],
        [None, None],
    ]
    labeled = autom8.create_matrix(_add_labels(training_rows), receiver=acc)
    ctx = autom8.create_context(labeled, receiver=acc)
    autom8.clean_dataset(ctx)

    assert len(ctx.steps) == 4
    assert len(acc.warnings) == 2
    assert ctx.matrix.tolist() == [
        [1.0, True, 3.3, True],
        [2.2, True, 4.0, True],
        [0.0, False, 0.0, False],
    ]

    # Replay the recorded steps against a fresh, unlabeled matrix.
    replay_rows = [['A', 'B'], [None, 10], [20.0, None], [30, 40]]
    replay_matrix = autom8.create_matrix(replay_rows, receiver=acc)
    out = PlaybackContext(replay_matrix, receiver=acc)
    playback(ctx.steps, out)

    assert out.matrix.tolist() == [
        [0.0, False, 10.0, True],
        [20.0, True, 0.0, False],
        [30.0, True, 40.0, True],
    ]
    replayed_columns = out.matrix.columns
    assert replayed_columns[0].dtype == float
    assert replayed_columns[2].dtype == float
def test_columns_with_numbers_as_strings():
    """Numeric strings (including '$' and '%' decorations) become numbers."""
    acc = autom8.Accumulator()
    training_rows = [
        ['A', 'B', 'C'],
        ['1.1', '$4', 7],
        ['2.2', '$5', 8],
        ['3.3', '6%', 9],
    ]
    labeled = autom8.create_matrix(_add_labels(training_rows), receiver=acc)
    ctx = autom8.create_context(labeled, receiver=acc)
    autom8.clean_dataset(ctx)

    assert len(acc.warnings) == 0
    assert len(ctx.steps) == 2
    assert ctx.matrix.tolist() == [[1.1, 4, 7], [2.2, 5, 8], [3.3, 6, 9]]

    # Replay the recorded steps; column C was never cleaned, so its strings
    # pass through unchanged.
    replay_rows = [['A', 'B', 'C'], [1, '2%', 'foo'], ['3', 4.0, 'bar']]
    replay_matrix = autom8.create_matrix(replay_rows, receiver=acc)
    out = PlaybackContext(replay_matrix, receiver=acc)
    playback(ctx.steps, out)

    assert out.matrix.tolist() == [[1, 2, 'foo'], [3, 4, 'bar']]
    replayed_columns = out.matrix.columns
    assert replayed_columns[0].dtype == int
    assert replayed_columns[1].dtype == float
def test_is_recording_property():
    """Only the context from create_context reports is_recording.

    Both context flavours are also expected to expose a ``receiver``.
    """
    matrix = autom8.create_matrix([[1, 2]])
    recording_ctx = autom8.create_context(matrix)
    playback_ctx = PlaybackContext(matrix, autom8.Accumulator())

    assert recording_ctx.is_recording
    assert not playback_ctx.is_recording

    assert hasattr(recording_ctx, 'receiver')
    assert hasattr(playback_ctx, 'receiver')
def test_columns_with_numbers_with_commas():
    """Numeric strings with thousands separators are parsed as numbers."""
    acc = autom8.Accumulator()
    rows = [['A'], ['1,100.0'], ['2,200'], ['3,300'], ['50']]
    labeled = autom8.create_matrix(_add_labels(rows), receiver=acc)
    ctx = autom8.create_context(labeled, receiver=acc)
    autom8.clean_dataset(ctx)

    assert len(acc.warnings) == 0
    assert len(ctx.steps) == 1

    expected_rows = [[1100], [2200], [3300], [50]]
    assert ctx.matrix.tolist() == expected_rows
def _create_context(features, roles):
    """Build a context from feature rows, appending a dummy label column.

    Args:
        features: Rows of feature values; each row must be a list, because
            a constant ``0`` label is appended with list concatenation.
        roles: Column roles for the feature columns, as a list. The role
            ``'numerical'`` is appended for the label column.

    Returns:
        Whatever ``autom8.create_context`` returns for the labeled dataset.
    """
    # Add a column of labels. It's required by create_context.
    dataset = [row + [0] for row in features]
    return autom8.create_context(
        dataset=dataset,
        column_names=_column_names(dataset),
        column_roles=roles + ['numerical'],
    )
def test_planner_decorator():
    """drop_duplicate_columns rejects a PlaybackContext.

    The same call on the context returned by create_context must succeed.
    """
    matrix = autom8.create_matrix([[1, 1], [2, 2]])
    recording_ctx = autom8.create_context(matrix)
    playback_ctx = PlaybackContext(matrix, autom8.Accumulator())

    # This should not raise an exception.
    autom8.drop_duplicate_columns(recording_ctx)

    # But this should raise one.
    with pytest.raises(autom8.Autom8Exception) as excinfo:
        autom8.drop_duplicate_columns(playback_ctx)
    # Checked after the ``with`` block: statements placed after the raising
    # call inside the block would never execute.
    excinfo.match('Expected.*RecordingContext')
def test_mixed_up_columns_with_strings_and_numbers():
    """Columns mixing strings and numbers split into number/string pairs.

    Each source column yields a 'number' column and a 'string' column, and
    the recorded steps produce the same layout when replayed on new data.
    """
    acc = autom8.Accumulator()
    training_rows = [
        ['A', 'B'],
        [True, 'foo'],
        [1.1, 30],
        [20, 4.4],
        ['bar', False],
        ['', 'baz'],
        [50, 'fiz'],
        [None, True],
    ]
    labeled = autom8.create_matrix(_add_labels(training_rows), receiver=acc)
    ctx = autom8.create_context(labeled, receiver=acc)
    autom8.clean_dataset(ctx)

    # The same formulas are expected after cleaning and after playback.
    split_formulas = [
        ['number', 'A'], ['string', 'A'],
        ['number', 'B'], ['string', 'B'],
    ]

    assert len(ctx.steps) == 6
    assert len(acc.warnings) == 0
    assert ctx.matrix.tolist() == [
        [1.0, '', 0.0, 'foo'],
        [1.1, '', 30.0, ''],
        [20.0, '', 4.4, ''],
        [0.0, 'bar', 0.0, ''],
        [0.0, '', 0.0, 'baz'],
        [50.0, '', 0.0, 'fiz'],
        [0.0, '', 1.0, ''],
    ]
    assert ctx.matrix.formulas == split_formulas

    # Replay the recorded steps against a fresh, unlabeled matrix.
    replay_rows = [['A', 'B'], [False, 'buz'], ['zim', 10], [2, None]]
    replay_matrix = autom8.create_matrix(replay_rows, receiver=acc)
    out = PlaybackContext(replay_matrix, receiver=acc)
    playback(ctx.steps, out)

    assert out.matrix.tolist() == [
        [0.0, '', 0.0, 'buz'],
        [0.0, 'zim', 10.0, ''],
        [2.0, '', 0.0, ''],
    ]
    assert out.matrix.formulas == split_formulas
def test_clean_numeric_labels():
    """Labels given as numeric strings, None, or '' are cleaned to numbers.

    The last column is the label column here: the strings '3' and '4'
    become numbers, while None and '' become 0, with one warning reported.
    """
    acc = autom8.Accumulator()
    rows = [
        ['A', 'B', 'C'],
        [1, 2, '3'],
        [3, 4, '4'],
        [5, 6, 5],
        [7, 8, None],
        [9, 9, ''],
    ]
    ctx = autom8.create_context(rows, receiver=acc)

    assert len(acc.warnings) == 1
    cleaned_labels = ctx.labels.original.tolist()
    assert cleaned_labels == [3, 4, 5, 0, 0]
def test_columns_with_some_empty_strings():
    """Empty strings count as missing values in otherwise typed columns.

    Each column gains a boolean 'is-defined' companion, and the recorded
    steps yield the same layout when replayed on new data.
    """
    acc = autom8.Accumulator()
    training_rows = [
        ['A', 'B', 'C'],
        [True, 1.1, 20],
        ['', 2.2, 30],
        [False, '', 40],
        [False, 3.3, ''],
        ['', 4.4, ''],
    ]
    labeled = autom8.create_matrix(_add_labels(training_rows), receiver=acc)
    ctx = autom8.create_context(labeled, receiver=acc)
    autom8.clean_dataset(ctx)

    # The same formulas are expected after cleaning and after playback.
    paired_formulas = [
        'A', ['is-defined', 'A'],
        'B', ['is-defined', 'B'],
        'C', ['is-defined', 'C'],
    ]

    assert len(ctx.steps) == 6
    assert len(acc.warnings) == 3
    assert ctx.matrix.tolist() == [
        [True, True, 1.1, True, 20, True],
        [False, False, 2.2, True, 30, True],
        [False, True, 0.0, False, 40, True],
        [False, True, 3.3, True, 0, False],
        [False, False, 4.4, True, 0, False],
    ]
    assert ctx.matrix.formulas == paired_formulas

    # Replay the recorded steps against a fresh, unlabeled matrix.
    replay_rows = [['A', 'B', 'C'], ['', 5.5, ''], [True, '', 50]]
    replay_matrix = autom8.create_matrix(replay_rows, receiver=acc)
    out = PlaybackContext(replay_matrix, receiver=acc)
    playback(ctx.steps, out)

    assert out.matrix.tolist() == [
        [False, False, 5.5, True, 0, False],
        [True, True, 0.0, False, 50, True],
    ]
    assert out.matrix.formulas == paired_formulas
def test_column_of_all_strings():
    """A column holding only strings is left untouched by clean_dataset."""
    acc = autom8.Accumulator()
    rows = [
        ['A', 'B'],
        ['1', 2],
        ['3', 4],
        ['n', 0],
    ]
    labeled = autom8.create_matrix(_add_labels(rows), receiver=acc)
    ctx = autom8.create_context(labeled, receiver=acc)
    autom8.clean_dataset(ctx)

    # No warnings and no recorded steps: the data comes back unchanged.
    assert len(acc.warnings) == 0
    assert len(ctx.steps) == 0
    unchanged_rows = [['1', 2], ['3', 4], ['n', 0]]
    assert ctx.matrix.tolist() == unchanged_rows
def test_evaluate_pipeline():
    """A linear model fitted through the context yields a usable candidate.

    The label of every row is the sum of its two features, so a linear
    regression is expected to reproduce it on train, test, and new inputs.
    """
    acc = autom8.Accumulator()
    feature_pairs = [
        [1, 2], [3, 4], [5, 6], [7, 8],
        [9, 10], [11, 12], [13, 14], [15, 16],
    ]
    # Append the label (sum of both features) to each row.
    dataset = [[first, second, first + second] for first, second in feature_pairs]
    ctx = autom8.create_context(dataset, receiver=acc)

    # For now, just hack in the test_indices that we want.
    ctx.test_indices = [2, 5]

    autom8.add_column_of_ones(ctx)
    ctx << sklearn.linear_model.LinearRegression()

    assert len(acc.candidates) == 1
    candidate = acc.candidates[0]

    # NOTE(review): exact float equality on a fitted metric; consider a
    # tolerance-based comparison if this ever proves flaky.
    assert candidate.train.metrics['r2_score'] == 1.0
    assert candidate.test.metrics['r2_score'] == 1.0

    expected_train = np.array([1 + 2, 3 + 4, 7 + 8, 9 + 10, 13 + 14, 15 + 16])
    assert np.allclose(candidate.train.predictions, expected_train)

    expected_test = np.array([5 + 6, 11 + 12])
    assert np.allclose(candidate.test.predictions, expected_test)

    # Try using the pipeline to make some predictions.
    unseen_rows = [[17, 18], [19, 20], [21, 22]]
    result = candidate.pipeline.run(unseen_rows, receiver=acc)
    expected_new = np.array([17 + 18, 19 + 20, 21 + 22])
    assert np.allclose(result.predictions, expected_new)
    assert result.probabilities is None
    assert not acc.warnings
def test_sandbox():
    """Changes made inside ctx.sandbox() are rolled back on exit."""
    ctx = autom8.create_context(
        dataset=[
            [1, 5, True, 9, 10],
            [2, 6, False, 10, 20],
            [3, 7, False, 11, 30],
            [4, 8, True, 12, 40],
        ],
        column_names=['A', 'B', 'C', 'D', 'E'],
        column_roles=['numerical'] * 2 + ['encoded'] + ['numerical'] * 2,
    )
    autom8.add_column_of_ones(ctx)

    # State before entering the sandbox: four feature columns plus the ones.
    baseline_rows = [
        [1, 5, True, 9, 1],
        [2, 6, False, 10, 1],
        [3, 7, False, 11, 1],
        [4, 8, True, 12, 1],
    ]
    assert len(ctx.steps) == 1
    assert len(ctx.matrix.columns) == 4 + 1
    assert ctx.matrix.tolist() == baseline_rows

    with ctx.sandbox():
        autom8.multiply_columns(ctx)
        assert len(ctx.steps) == 2
        # Three product columns are added, one per pair of numerical columns.
        assert len(ctx.matrix.columns) == 4 + 1 + 3
        assert ctx.matrix.tolist() == [
            [1, 5, True, 9, 1, 1 * 5, 1 * 9, 5 * 9],
            [2, 6, False, 10, 1, 2 * 6, 2 * 10, 6 * 10],
            [3, 7, False, 11, 1, 3 * 7, 3 * 11, 7 * 11],
            [4, 8, True, 12, 1, 4 * 8, 4 * 12, 8 * 12],
        ]

    # Now check that the context has been restored to its previous state.
    assert len(ctx.steps) == 1
    assert len(ctx.matrix.columns) == 4 + 1
    assert ctx.matrix.tolist() == baseline_rows
def test_primitives_with_object_dtype():
    """Object-dtype columns of primitives are narrowed to bool/float/int.

    The narrowing is recorded, so replaying the steps coerces new data to
    the same dtypes.
    """
    acc = autom8.Accumulator()
    training_rows = [
        ['A', 'B', 'C'],
        [True, 1.1, 2],
        [False, 3.1, 4],
        [True, 5.1, 6],
    ]
    labeled = autom8.create_matrix(_add_labels(training_rows), receiver=acc)

    # Force every column to the generic object dtype before cleaning.
    for column in labeled.columns:
        column.values = column.values.astype(object)

    ctx = autom8.create_context(labeled, receiver=acc)
    autom8.clean_dataset(ctx)

    cleaned_dtypes = [column.dtype for column in ctx.matrix.columns]
    assert cleaned_dtypes[0] == bool
    assert cleaned_dtypes[1] == float
    assert cleaned_dtypes[2] == int

    # First replay: values that can be coerced to the recorded dtypes.
    coercible_rows = [['A', 'B', 'C'], [1, 2, 3.0], [0, 4, 5.0], [1, False, 6.9]]
    coercible_matrix = autom8.create_matrix(coercible_rows, receiver=acc)
    out = PlaybackContext(coercible_matrix, receiver=acc)
    playback(ctx.steps, out)

    assert out.matrix.tolist() == [[True, 2.0, 3], [False, 4.0, 5], [True, 0.0, 6]]
    replayed_dtypes = [column.dtype for column in out.matrix.columns]
    assert replayed_dtypes[0] == bool
    assert replayed_dtypes[1] == float
    assert replayed_dtypes[2] == int

    # Second replay: strings, None, and an unexpected value.
    awkward_rows = [['A', 'B', 'C'], ['1', '2', None], ['', None, ()]]
    awkward_matrix = autom8.create_matrix(awkward_rows, receiver=acc)
    out = PlaybackContext(awkward_matrix, receiver=acc)
    playback(ctx.steps, out)

    # Compare via repr: nan never compares equal to itself, so a direct
    # list comparison could not match the expected nan.
    rendered = repr(out.matrix.tolist())
    assert rendered == "[[True, 2.0, 0], [False, nan, 0]]"
def test_column_with_all_none():
    """A column containing only None is dropped with a single warning.

    The drop is recorded, so the same column is removed on playback even
    when the new data has values in it.
    """
    acc = autom8.Accumulator()
    training_rows = [
        ['A', 'B', 'C'],
        [True, None, 2],
        [False, None, 4],
        [True, None, 6],
    ]
    labeled = autom8.create_matrix(_add_labels(training_rows), receiver=acc)
    ctx = autom8.create_context(labeled, receiver=acc)
    autom8.clean_dataset(ctx)

    assert len(acc.warnings) == 1
    assert 'Dropping column' in acc.warnings[0]
    assert ctx.matrix.tolist() == [[True, 2], [False, 4], [True, 6]]

    # Replay the recorded steps: column B is dropped again.
    replay_rows = [['A', 'B', 'C'], [1, 2, 'foo'], [3, 4, 'bar']]
    replay_matrix = autom8.create_matrix(replay_rows, receiver=acc)
    out = PlaybackContext(replay_matrix, receiver=acc)
    playback(ctx.steps, out)

    assert out.matrix.tolist() == [[1, 'foo'], [3, 'bar']]
def test_matrix_with_unexpected_value():
    """A column of unsupported values is dropped with a single warning.

    Column C holds a tuple, a dict, and a bare object; it is removed during
    cleaning and again when the recorded steps are replayed.
    """
    acc = autom8.Accumulator()
    training_rows = [
        ['A', 'B', 'C'],
        [1, 2, ()],
        [3, 4, {}],
        [5, 6, object()],
    ]
    labeled = autom8.create_matrix(_add_labels(training_rows), receiver=acc)
    ctx = autom8.create_context(labeled, receiver=acc)
    autom8.clean_dataset(ctx)

    assert len(acc.warnings) == 1
    warning = acc.warnings[0]
    assert 'Dropping column' in warning
    assert 'contain booleans, numbers' in warning
    assert ctx.matrix.tolist() == [[1, 2], [3, 4], [5, 6]]

    # Replay the recorded steps: column C is dropped again.
    replay_rows = [['A', 'B', 'C'], [1, 2, 'foo'], [3, 4, 'bar']]
    replay_matrix = autom8.create_matrix(replay_rows, receiver=acc)
    out = PlaybackContext(replay_matrix, receiver=acc)
    playback(ctx.steps, out)

    assert out.matrix.tolist() == [[1, 2], [3, 4]]
def test_column_of_all_strings_and_none_values():
    """None inside a string column is replaced with the empty string.

    Only column A receives a recorded step, so on playback a None in
    column B passes through untouched.
    """
    acc = autom8.Accumulator()
    training_rows = [
        ['A', 'B'],
        ['1', 2],
        ['foo', 4],
        [None, 0],
    ]
    labeled = autom8.create_matrix(_add_labels(training_rows), receiver=acc)
    ctx = autom8.create_context(labeled, receiver=acc)
    autom8.clean_dataset(ctx)

    assert len(acc.warnings) == 0
    assert len(ctx.steps) == 1
    assert ctx.matrix.tolist() == [['1', 2], ['foo', 4], ['', 0]]

    # Replay the recorded step against a fresh, unlabeled matrix.
    replay_rows = [['A', 'B'], [None, 'bar'], ['baz', None]]
    replay_matrix = autom8.create_matrix(replay_rows, receiver=acc)
    out = PlaybackContext(replay_matrix, receiver=acc)
    playback(ctx.steps, out)

    assert out.matrix.tolist() == [['', 'bar'], ['baz', None]]
def test_training_and_testing_data():
    """testing_data/training_data split rows and labels by test_indices."""
    source = autom8.create_matrix([
        [1, 5, True, 9, 10],
        [2, 6, False, 10, 20],
        [3, 7, False, 11, 30],
        [4, 8, True, 12, 40],
    ])
    ctx = autom8.create_context(source)

    # For now, just hack in the test_indices that we want.
    ctx.test_indices = [1, 3]

    test_matrix, test_labels = ctx.testing_data()
    train_matrix, train_labels = ctx.training_data()

    # The last column holds the labels; rows 1 and 3 form the test set.
    assert test_labels.tolist() == [20, 40]
    assert train_labels.tolist() == [10, 30]
    assert test_matrix.tolist() == [
        [2, 6, False, 10],
        [4, 8, True, 12],
    ]
    assert train_matrix.tolist() == [
        [1, 5, True, 9],
        [3, 7, False, 11],
    ]
def test_invalid_contexts():
    """create_context raises Autom8Exception naming the offending argument.

    Each case is (positional args, keyword args, pattern the exception
    message must match); cases run in order and stop at the first failure.
    """
    invalid_calls = [
        (([],), {}, 'Expected.*dataset'),
        (([[1], [2], [3]],), {}, 'Expected.*dataset'),
        (
            (),
            dict(dataset=[['A', 'B'], [1, 2]], target_column='C'),
            'Expected.*target_column',
        ),
        (
            (),
            dict(dataset=[['A', 'B'], [1, 2]], target_column=object()),
            'Expected.*target_column',
        ),
        (
            (),
            dict(dataset=[['A', 'B'], [1, 2]], target_column=10),
            'Expected.*target_column',
        ),
        (
            (),
            dict(dataset=[['A', 'B'], [1, 2]], problem_type='classify'),
            'Expected.*problem_type',
        ),
        (
            (),
            dict(dataset=[['A', 'B'], [1, 2]], test_ratio=1.2),
            'Expected.*test_ratio',
        ),
    ]

    for args, kwargs, pattern in invalid_calls:
        with pytest.raises(autom8.Autom8Exception) as excinfo:
            autom8.create_context(*args, **kwargs)
        # Checked after the ``with`` block so it runs once the exception
        # has been captured.
        excinfo.match(pattern)