def test_order(self):
    """Testing production of chunks in a different order from the stream.

    Streams data chunks through a ChunkShuffler-backed ChunkAccumulator
    over a grid of data sizes, attribute counts, buffer sizes, and input
    chunk sizes, then checks that the concatenated output is a
    permutation of the input: same length but a different order.
    """
    data_sizes = [200, 545]
    data_attrs_numbers = [5, 8, 2, 1, 15]
    inp_chunk_sizes = [1, 2, 3, 4, 5]
    buffer_sizes = [2, 38, 1000]
    for data_size, data_attrs_number, buffer_size, input_chunk_size in \
            itertools.product(data_sizes, data_attrs_numbers,
                              buffer_sizes, inp_chunk_sizes):
        data = generate_data_chunk(data_attrs_number, data_size)
        inp_data_chunks = create_list_of_data_chunks(data,
                                                     input_chunk_size)
        chunk_collector = ChunkShuffler(buffer_size=buffer_size)
        accum = ChunkAccumulator(collector=chunk_collector)
        actual_chunks = list(accum.iter(inp_data_chunks))
        actual_ds = concat_chunks(*actual_chunks)
        # NOTE(review): a shuffle can in principle reproduce the original
        # order, which would make this assertion flaky — presumably the
        # data/buffer sizes make that probability negligible; confirm.
        self.assertNotEqual(data, actual_ds)
        self.assertEqual(len(data), len(actual_ds))
def test_chunk_size_adjustment_with_random_data_and_params(self):
    """Default scenario when only the size of chunks is adjusted.

    Re-chunks an input stream through a UnitCollector-backed
    ChunkAccumulator over a grid of parameters and verifies that each
    emitted batch equals the batch obtained by slicing the full data
    directly, and that exactly the expected number of batches is produced.
    """
    data_sizes = [100, 102, 54, 35]
    data_attrs_numbers = [5, 8, 2, 1, 15]
    inp_chunk_sizes = [10, 15, 63, 1, 2]
    batch_sizes = [1, 2, 38, 1000]
    for data_size, data_attrs_number, batch_size, input_chunk_size in \
            itertools.product(data_sizes, data_attrs_numbers,
                              batch_sizes, inp_chunk_sizes):
        data = generate_data_chunk(data_attrs_number, data_size)
        expected_batches = create_list_of_data_chunks(data, batch_size)
        inp_data_chunks = create_list_of_data_chunks(data,
                                                     input_chunk_size)
        chunk_collector = UnitCollector(max_size=batch_size)
        batcher = ChunkAccumulator(collector=chunk_collector)
        indx = 0
        for actual_chunk in batcher.iter(inp_data_chunks):
            # assertEqual (not assertTrue(==)) so a mismatch reports
            # the differing chunks instead of a bare "False is not true".
            self.assertEqual(actual_chunk, expected_batches[indx])
            indx += 1
        # every expected batch must have been produced, no more, no fewer
        self.assertEqual(len(expected_batches), indx)
def test_output(self):
    """Checking if read data-chunks are valid.

    Reads a small CSV fixture both through CsvReader and through the
    plain read_data_from_csv_file helper, and verifies the reader yields
    exactly the chunks obtained by slicing the directly-read data.
    """
    data_path = 'mltoolkit/mldp/tests/data/small_chunks/chunk2.csv'
    chunk_size = 2
    reader = CsvReader(chunk_size=chunk_size, worker_threads_num=1,
                       sep=',', encoding='utf-8', use_lists=False)
    data = read_data_from_csv_file(data_path, encoding='utf-8')
    expected_chunks = create_list_of_data_chunks(data,
                                                 chunk_size=chunk_size)
    # guard first: an empty fixture would otherwise let the loop below
    # pass vacuously (the original folded this into one compound assert)
    self.assertTrue(len(expected_chunks) > 0)
    itr = reader.iter(data_path=data_path)
    i = 0
    for actual_chunk, expected_chunk in zip(itr, expected_chunks):
        # assertEqual gives a diff of the chunks on failure
        self.assertEqual(actual_chunk, expected_chunk)
        i += 1
    # the reader must produce every expected chunk
    self.assertEqual(i, len(expected_chunks))
def test_output(self):
    """Checking that FunctionApplier transforms the selected attributes.

    For several element-wise functions, applies a FunctionApplier mapping
    over the first `transform_attrs_number` attributes of each input
    chunk and compares the result against a manually transformed copy.
    """
    from copy import deepcopy
    data_size = 1234
    data_attrs_number = 15
    input_chunks_size = 10
    transform_attrs_number = 10
    functions = [lambda x: np.log(abs(x) + 1),
                 lambda x: np.exp(x),
                 lambda x: x ** 2]
    data = generate_data_chunk(data_attrs_number, data_size)
    transform_attrs = list(data.keys())[:transform_attrs_number]
    input_data_chunks = create_list_of_data_chunks(data, input_chunks_size)
    for func in functions:
        function_applier = FunctionApplier({a: func
                                            for a in transform_attrs})
        for input_data_chunk in input_data_chunks:
            # BUG FIX: the original did `expected_chunk = input_data_chunk`,
            # aliasing the object passed to the applier. If FunctionApplier
            # mutates its input in place, the manual transform below
            # double-applied `func` and the final comparison was an object
            # compared against itself — vacuously true. Build the expected
            # chunk from an independent deep copy BEFORE applying.
            expected_chunk = deepcopy(input_data_chunk)
            # transforming manually values of input data-chunks
            for transform_attr in transform_attrs:
                expected_chunk[transform_attr] = \
                    func(expected_chunk[transform_attr])
            actual_chunk = function_applier(input_data_chunk)
            self.assertEqual(actual_chunk, expected_chunk)