def test_can_define_column_configuration(self):
    """Check that a column's ``config`` is exposed on its producer entry."""
    schema = Schema()
    schema.define_column('A', type='int', config={'a': 10})

    self.assertEqual(('A', ), schema.columns)

    # Only the first producer entry is inspected by this test.
    producer = schema.producers[0]
    self.assertEqual('A', producer.name)
    self.assertEqual('int', producer.type)
    self.assertEqual({'a': 10}, producer.config)
def test_can_obtain_a_column_type(self):
    """Check the producer entry of a column defined with a type only.

    The entry is expected to carry the column name, the requested type and
    an empty configuration.
    """
    schema = Schema()
    schema.define_column('A', type='int')

    self.assertEqual(('A', ), schema.columns)

    producer = schema.producers[0]
    self.assertEqual('A', producer.name)
    self.assertEqual('int', producer.type)
    self.assertEqual({}, producer.config)
def test_can_build_a_generator_from_a_schema_with_config(self):
    """Check that an engine honours a column's ``min`` configuration.

    Twenty single-column rows are generated and compared, as a set, with
    the values drawn from ``self.rand_copy`` using the lower bound 10.
    """
    schema = Schema()
    schema.define_column('A', type='int', config={'min': 10})
    engine = Engine(schema, self.library)
    self.assertEqual(1, engine.number_of_columns)

    values = set(engine.generate_data(20))
    # NOTE(review): assumes self.rand_copy replays the random stream used by
    # self.library and that rows are 1-tuples -- confirm against setUp.
    expected_values = {(self.rand_copy.randint(10, 1_000_000),) for _ in range(20)}
    # The expectation was previously computed but never checked, so the
    # test could not fail on wrong generated data.
    self.assertEqual(expected_values, values)
def test_can_mix_reference_and_auto_generated_producers(self):
    """Check a schema mixing a referenced producer with a typed column.

    Column 'A' references the explicit producer 'my_producer' while column
    'B' is defined by type; the schema must then list exactly the producers
    'B' and 'my_producer'.
    """
    schema = Schema()
    schema.add_producer('my_producer', type='int')
    schema.define_column('A', producer='my_producer')
    schema.define_column('B', type='int')

    # Sort by name so the comparison does not depend on registration order.
    by_name = sorted(schema.producers, key=lambda producer: producer.name)
    self.assertEqual(2, len(by_name))

    expected_auto = SimpleNamespace(name='B', type='int', config={})
    expected_explicit = SimpleNamespace(name='my_producer', type='int', config={})
    self.assertEqual(expected_auto, by_name[0])
    self.assertEqual(expected_explicit, by_name[1])
def test_add_column_does_not_add_a_column_if_it_raises_error(self):
    """Check that a rejected duplicate ``add_column`` leaves columns intact."""
    schema = Schema()
    schema.add_column('A')

    # Second registration of the same name must be refused.
    self.assertRaises(SchemaError, schema.add_column, 'A')

    self.assertEqual(('A', ), schema.columns)
def test_must_specify_producer_or_type(self):
    """Check that ``define_column`` raises TypeError without producer or type.

    Passing only a ``config`` is also rejected.
    """
    schema = Schema()
    self.assertRaises(TypeError, schema.define_column, 'A')
    self.assertRaises(TypeError, schema.define_column, 'A', config={'a': 1})
def test_raises_error_if_register_same_producer_multiple_times(self):
    """Check that defining column 'A' twice by type raises SchemaError.

    The error message must report that the column is already defined.
    """
    schema = Schema()
    schema.define_column('A', type='int')

    with self.assertRaises(SchemaError) as raised:
        schema.define_column('A', type='int')

    message = str(raised.exception)
    self.assertEqual("Column 'A' is already defined.", message)
def test_add_column_raises_error_if_column_is_already_defined(self):
    """Check the SchemaError message raised by a duplicate ``add_column``."""
    schema = Schema()
    schema.add_column('A')

    with self.assertRaises(SchemaError) as raised:
        schema.add_column('A')

    message = str(raised.exception)
    self.assertEqual("Column 'A' is already defined.", message)
def test_raises_error_if_register_same_transformer_multiple_times(self):
    """Check that re-using a transformer name raises SchemaError.

    The same name, inputs, outputs and transformer object are registered
    twice; the second call must fail with the 'already defined' message.
    """
    schema = Schema()
    schema.define_column('A', type='int')
    ret_none = FunctionalTransformer(lambda x: None)

    schema.add_transformer(
        'my_transformer', inputs=['A'], outputs=['A'], transformer=ret_none)

    with self.assertRaises(SchemaError) as raised:
        schema.add_transformer(
            'my_transformer', inputs=['A'], outputs=['A'], transformer=ret_none)

    message = str(raised.exception)
    self.assertEqual("Transformer 'my_transformer' is already defined.", message)
def test_can_generate_two_identical_columns_by_referencing_name_of_auto_created_producer(self):
    """Check that a column referencing another column's producer mirrors it.

    Column 'B' uses producer 'A' (the name of the first column), so over
    ten generated rows both columns must hold the same values.
    """
    schema = Schema()
    schema.define_column('A', type='int')
    schema.define_column('B', producer='A')
    engine = Engine(schema, self.library)

    rows = list(engine.generate_data(number_of_rows=10))

    # Transpose the rows into one tuple per column.
    column_a, column_b = zip(*rows)
    self.assertEqual(column_a, column_b)
def test_can_generate_some_data_no_header(self):
    """Generate one row of three int columns with the header disabled.

    NOTE(review): as visible here, ``lines`` and ``expected_values`` are
    computed but never compared -- a final assertion looks to be missing.
    """
    schema = Schema(show_header=False)
    for column in ('A', 'B', 'C'):
        schema.define_column(column, type='int')

    saved_data = StringIO()
    generate_data(schema, self.library, saved_data, number_of_rows=1)

    lines = saved_data.getvalue()
    # Three draws: one per column of the single generated row.
    expected_values = tuple(
        self.rand_copy.randint(0, 1_000_000) for _ in range(3))
def test_can_generate_some_data_bytecount(self):
    """Generate three int columns bounded by ``byte_count=128``.

    NOTE(review): as visible here, ``lines`` and ``expected_values`` are
    computed but never compared -- a final assertion looks to be missing.
    """
    schema = Schema()
    for column in ('A', 'B', 'C'):
        schema.define_column(column, type='int')

    saved_data = StringIO()
    generate_data(schema, self.library, saved_data, byte_count=128)

    lines = saved_data.getvalue()
    # Six comma-separated rows of three values each, drawn in row order.
    expected_values = []
    for _ in range(6):
        row = [self.rand_copy.randint(0, 1_000_000) for _ in range(3)]
        expected_values.append(','.join(map(str, row)))
def test_can_generate_producer_data_with_number_of_rows(self):
    """Check that ``number_of_rows=10`` yields the ten expected rows.

    The rows of a three-column int schema are compared with thirty values
    drawn from ``self.rand_copy``, grouped three per row.
    """
    schema = Schema()
    schema.define_column('A', type='int')
    schema.define_column('B', type='int')
    schema.define_column('C', type='int')
    engine = Engine(schema, self.library)

    generated_values = list(engine.generate_data(number_of_rows=10))
    self.assertEqual(10, len(generated_values))

    # NOTE(review): assumes self.rand_copy replays the random stream used by
    # self.library and that rows are tuples filled in column order -- confirm.
    iterable = (self.rand_copy.randint(0, 1_000_000) for _ in range(30))
    # Zipping one iterator with itself groups consecutive draws into rows.
    expected_values = list(zip(iterable, iterable, iterable))
    # Previously the expected stream was built but never compared.
    self.assertEqual(expected_values, generated_values)
def test_can_generate_stream_of_data(self):
    """Check the first 1000 rows of an unbounded ``generate_data()`` stream.

    The rows of a three-column int schema are compared with 3000 values
    drawn from ``self.rand_copy``, grouped three per row.
    """
    schema = Schema()
    schema.define_column('A', type='int')
    schema.define_column('B', type='int')
    schema.define_column('C', type='int')
    engine = Engine(schema, self.library)

    # No size argument is given, so only a bounded slice is consumed.
    generated_values = list(it.islice(engine.generate_data(), 1000))
    self.assertEqual(1000, len(generated_values))

    # NOTE(review): assumes self.rand_copy replays the random stream used by
    # self.library and that rows are tuples filled in column order -- confirm.
    iterable = (self.rand_copy.randint(0, 1_000_000) for _ in range(3000))
    # Zipping one iterator with itself groups consecutive draws into rows.
    expected_values = list(zip(iterable, iterable, iterable))
    # Previously the expected stream was built but never compared.
    self.assertEqual(expected_values, generated_values)
def test_can_create_columns_with_same_producer(self):
    """Check that two columns can reference one explicitly added producer.

    Both columns are listed, while the schema still holds a single
    producer entry named 'my_producer'.
    """
    schema = Schema()
    schema.add_producer('my_producer', type='int')
    for column in ('A', 'B'):
        schema.define_column(column, producer='my_producer')

    self.assertEqual(('A', 'B'), schema.columns)
    self.assertEqual(1, len(schema.producers))

    producer = schema.producers[0]
    self.assertEqual('my_producer', producer.name)
    self.assertEqual('int', producer.type)
    self.assertEqual({}, producer.config)
def test_can_generate_some_data_stream(self):
    """Stream data into a ``MaxSizeFileIO(256)`` until it raises IOError.

    NOTE(review): as visible here, ``lines`` and ``expected_values`` are
    computed but never compared -- a final assertion looks to be missing.
    """
    schema = Schema()
    for column in ('A', 'B', 'C'):
        schema.define_column(column, type='int')

    saved_data = MaxSizeFileIO(256)
    with self.assertRaises(IOError):
        generate_data(schema, self.library, saved_data, stream_mode=True)

    lines = saved_data.buffer
    # Twelve comma-separated rows of three values each, drawn in row order.
    expected_values = []
    for _ in range(12):
        row = [self.rand_copy.randint(0, 1_000_000) for _ in range(3)]
        expected_values.append(','.join(map(str, row)))
def test_can_build_a_generator_from_a_schema(self):
    """Check that an engine built from a three-column schema generates a row.

    The single generated row is compared with three values drawn from
    ``self.rand_copy``.
    """
    schema = Schema()
    schema.define_column('A', type='int')
    schema.define_column('B', type='int')
    schema.define_column('C', type='int')
    engine = Engine(schema, self.library)
    self.assertEqual(3, engine.number_of_columns)

    values = list(engine.generate_data(1))
    self.assertEqual(1, len(values))

    # NOTE(review): assumes self.rand_copy replays the random stream used by
    # self.library and that rows are tuples filled in column order -- confirm.
    expected_values = tuple(self.rand_copy.randint(0, 1_000_000) for _ in range(3))
    # Previously the expectation was computed but never compared.
    self.assertEqual(expected_values, values[0])
def test_raises_an_error_if_inputs_do_not_exist(self):
    """Check that a transformer reading an undefined input is rejected.

    Only column 'A' exists; using 'B' as input must raise SchemaError with
    a message naming the missing input.
    """
    schema = Schema()
    schema.define_column('A', type='int')
    ret_none = FunctionalTransformer(lambda x: None)

    with self.assertRaises(SchemaError) as raised:
        schema.add_transformer(
            'my_transformer', inputs=['B'], outputs=['A'], transformer=ret_none)

    message = str(raised.exception)
    self.assertEqual("Inputs: 'B' are not defined in the schema.", message)
def test_raises_an_error_if_double_output_name(self):
    """Check that listing the same output name twice raises SchemaError."""
    schema = Schema()
    for column in ('A', 'B'):
        schema.define_column(column, type='int')
    # Two outputs are declared so that only the duplicate name is at fault.
    ret_none = FunctionalTransformer(lambda x: None, num_outputs=2)

    with self.assertRaises(SchemaError) as raised:
        schema.add_transformer(
            'my_transformer', inputs=['A'], outputs=['A', 'A'], transformer=ret_none)

    message = str(raised.exception)
    self.assertEqual("Outputs must be unique. Got multiple 'A' outputs.", message)
def test_raises_an_error_if_num_outputs_do_not_match_arity(self):
    """Check that the number of outputs must match the transformer's.

    Two outputs are requested from a transformer built without
    ``num_outputs``; the expected message reports its output count as 1.
    """
    schema = Schema()
    for column in ('A', 'B'):
        schema.define_column(column, type='int')
    ret_none = FunctionalTransformer(lambda x: None)

    with self.assertRaises(SchemaError) as raised:
        schema.add_transformer(
            'my_transformer', inputs=['A'], outputs=['A', 'B'], transformer=ret_none)

    message = str(raised.exception)
    self.assertEqual(
        "Got 2 outputs: 'A', 'B' but transformer's number of outputs is 1.",
        message)
def test_can_add_a_transformer(self):
    """Check that a registered transformer is listed with its settings."""
    schema = Schema()
    schema.define_column('A', type='int')
    add_one = FunctionalTransformer(lambda x: x + 1)

    schema.add_transformer(
        'my_transformer', inputs=['A'], outputs=['A'], transformer=add_one)

    self.assertEqual(1, len(schema.transformers))
    expected_entry = SimpleNamespace(
        name='my_transformer', inputs=['A'], outputs=['A'], transformer=add_one)
    self.assertEqual(expected_entry, schema.transformers[0])
def test_str(self):
    """Check ``str(schema)`` against a verbose regular expression.

    The schema has one explicit producer referenced by columns 'A' and 'B'.
    """
    schema = Schema()
    schema.add_producer('my_producer', type='int')
    for column in ('A', 'B'):
        schema.define_column(column, producer='my_producer')

    # Adjacent raw literals are concatenated into one pattern; re.VERBOSE
    # ignores the literal spaces that separate the pieces.
    expected_pattern = re.compile(
        r" Schema\( "
        r"\s*columns=\[[^]]+\], "
        r"\s*producers=\{my_producer:\s*\{'type':\s'int',\s'config':\s\{\}\}\}, "
        r"\s*transformers=(\{'name':\s'(A|B)',\s*"
        r"'transformer':\s<feanor\.schema\.ProjectionTransformer\sobject\sat\s\w+>,\s*"
        r"'inputs':\s\[[^]]+\],\s'outputs':\s\[[^]]+\]\},?\s*)+ "
        r"show_header=True\s* "
        r"\) ",
        re.VERBOSE)

    self.assertRegex(str(schema), expected_pattern)
def test_can_create_column_by_referencing_producer(self):
    """Check the schema state after a column references a named producer.

    Expected: one column 'A', one producer 'my_producer', and one
    transformer named 'A' mapping 'my_producer' to 'A' through
    ``ProjectionTransformer(1, 0)``.
    """
    schema = Schema()
    schema.add_producer('my_producer', type='int')
    schema.define_column('A', producer='my_producer')

    self.assertEqual(('A', ), schema.columns)

    self.assertEqual(1, len(schema.producers))
    producer = schema.producers[0]
    self.assertEqual('my_producer', producer.name)
    self.assertEqual('int', producer.type)
    self.assertEqual({}, producer.config)

    self.assertEqual(1, len(schema.transformers))
    projection = schema.transformers[0]
    self.assertEqual('A', projection.name)
    self.assertEqual(['my_producer'], projection.inputs)
    self.assertEqual(['A'], projection.outputs)
    self.assertEqual(ProjectionTransformer(1, 0), projection.transformer)
def test_can_repeat_input_name_of_transformer(self):
    """Check that the same column may appear twice in a transformer's inputs.

    Unlike outputs, inputs need not be unique: 'A' is passed as both
    arguments of a two-argument transformer and the registration succeeds.
    """
    schema = Schema()
    schema.define_column('A', type='int')
    schema.define_column('B', type='int')
    # Named for what it does; it adds its two inputs rather than returning None.
    add_inputs = FunctionalTransformer(lambda x, y: x + y)

    schema.add_transformer(
        'my_transformer', inputs=['A', 'A'], outputs=['A'], transformer=add_inputs)

    # Expected value first, as in the other assertions of this suite.
    self.assertEqual(1, len(schema.transformers))
    self.assertEqual(
        SimpleNamespace(
            name='my_transformer', inputs=['A', 'A'], outputs=['A'],
            transformer=add_inputs),
        schema.transformers[0])
def test_creates_different_producers_when_multiple_columns(self):
    """Check that each column defined by type gets its own producer.

    Three int columns must yield three producers named after the columns,
    each of type 'int' with an empty configuration.
    """
    column_names = ('A', 'B', 'C')
    schema = Schema()
    for column in column_names:
        schema.define_column(column, type='int')

    self.assertEqual(('A', 'B', 'C'), schema.columns)
    self.assertEqual(3, len(schema.producers))

    # Sort by name so the check does not depend on registration order.
    producers = sorted(schema.producers, key=lambda x: x.name)
    for expected_name, producer in zip(column_names, producers):
        self.assertEqual(expected_name, producer.name)
        self.assertEqual('int', producer.type)
        self.assertEqual({}, producer.config)
def test_can_use_transformer_to_filter_value(self):
    """Check that a transformer wrapping a None-returning function registers."""
    schema = Schema()
    schema.define_column('A', type='int')

    def test_transformer(unused):
        # Ignores its input and always yields None.
        return None

    ret_none = FunctionalTransformer(test_transformer)
    schema.add_transformer(
        'my_transformer', inputs=['A'], outputs=['A'], transformer=ret_none)

    self.assertEqual(1, len(schema.transformers))
    expected_entry = SimpleNamespace(
        name='my_transformer', inputs=['A'], outputs=['A'], transformer=ret_none)
    self.assertEqual(expected_entry, schema.transformers[0])
def test_generate_data_raises_if_missing_size_parameters(self):
    """Check that ``generate_data`` raises TypeError with no size argument.

    No ``number_of_rows``, ``byte_count`` or ``stream_mode`` is supplied.
    """
    with self.assertRaises(TypeError):
        empty_schema = Schema()
        # The output object is never expected to be written to.
        sink = mock.MagicMock()
        generate_data(empty_schema, self.library, sink)
def test_can_add_columns_to_a_schema(self):
    """Check that ``add_column`` keeps columns in insertion order."""
    schema = Schema()
    for column in ('A', 'B', 'C'):
        schema.add_column(column)

    self.assertEqual(('A', 'B', 'C'), schema.columns)
def test_generate_data_raises_if_both_num_rows_and_num_bytes_are_specified(self):
    """Check that ``number_of_rows`` and ``byte_count`` cannot be combined.

    Supplying both to ``generate_data`` must raise TypeError.
    """
    with self.assertRaises(TypeError):
        empty_schema = Schema()
        # The output object is never expected to be written to.
        sink = mock.MagicMock()
        generate_data(
            empty_schema, self.library, sink, number_of_rows=10, byte_count=100)
def test_can_specify_header_visibility(self):
    """Check that ``show_header=False`` is reflected by ``Schema.show_header``."""
    headerless = Schema(show_header=False)
    self.assertFalse(headerless.show_header)