Exemplo n.º 1
0
    def test_order(self):
        """Shuffled output must differ from the source data but keep its length.

        Every combination of data size, number of attributes, shuffler buffer
        size and input chunk size is exercised.
        """
        param_grid = itertools.product(
            [200, 545],         # data sizes
            [5, 8, 2, 1, 15],   # numbers of data attributes
            [2, 38, 1000],      # shuffler buffer sizes
            [1, 2, 3, 4, 5],    # input chunk sizes
        )

        for size, attrs_number, buf_size, inp_chunk_size in param_grid:
            source = generate_data_chunk(attrs_number, size)
            fed_chunks = create_list_of_data_chunks(source, inp_chunk_size)

            shuffler = ChunkShuffler(buffer_size=buf_size)
            accumulator = ChunkAccumulator(collector=shuffler)

            # Drain the accumulator, then glue the pieces back into one chunk.
            produced = list(accumulator.iter(fed_chunks))
            merged = concat_chunks(*produced)

            # The content is expected to come out in a different order ...
            self.assertTrue(source != merged)
            # ... while the overall length stays the same.
            self.assertEqual(len(source), len(merged))
    def test_chunk_size_adjustment_with_random_data_and_params(self):
        """Re-batching scenario: only the size of the chunks is changed.

        For every parameter combination the generated data is split once into
        the expected batches and once into input chunks. The chunks produced
        by the accumulator must match the expected batches one by one, and
        their count must match as well.
        """
        param_grid = itertools.product(
            [100, 102, 54, 35],     # data sizes
            [5, 8, 2, 1, 15],       # numbers of data attributes
            [1, 2, 38, 1000],       # batch sizes
            [10, 15, 63, 1, 2],     # input chunk sizes
        )

        for size, attrs_number, batch_size, inp_chunk_size in param_grid:
            source = generate_data_chunk(attrs_number, size)
            wanted = create_list_of_data_chunks(source, batch_size)
            fed_chunks = create_list_of_data_chunks(source, inp_chunk_size)

            batcher = ChunkAccumulator(
                collector=UnitCollector(max_size=batch_size))

            seen = 0
            for seen, produced in enumerate(batcher.iter(fed_chunks), start=1):
                # Indexing (rather than zipping) raises on surplus chunks.
                self.assertTrue(produced == wanted[seen - 1])
            self.assertEqual(len(wanted), seen)
Exemplo n.º 3
0
    def test_output(self):
        """Checking if read data-chunks are valid.

        The CSV file is read once through ``CsvReader`` and once through the
        ``read_data_from_csv_file`` helper; the helper's data, split into
        chunks of the same size, serves as the expected output. Both the
        number of chunks and the content of each chunk are compared.
        """
        data_path = 'mltoolkit/mldp/tests/data/small_chunks/chunk2.csv'
        chunk_size = 2

        reader = CsvReader(chunk_size=chunk_size, worker_threads_num=1, sep=',',
                           encoding='utf-8', use_lists=False)

        data = read_data_from_csv_file(data_path, encoding='utf-8')
        expected_chunks = create_list_of_data_chunks(data,
                                                     chunk_size=chunk_size)
        # An empty expectation would make the comparison below vacuous.
        self.assertTrue(len(expected_chunks) > 0)

        # Materialise the reader output before comparing: zipping the live
        # iterator with the expected list stops at the shorter of the two, so
        # surplus chunks produced by the reader would go unnoticed.
        actual_chunks = list(reader.iter(data_path=data_path))
        self.assertEqual(len(expected_chunks), len(actual_chunks))

        for actual_chunk, expected_chunk in zip(actual_chunks,
                                                expected_chunks):
            self.assertTrue(actual_chunk == expected_chunk)
Exemplo n.º 4
0
    def test_output(self):
        """Check FunctionApplier output against manually transformed chunks.

        Each function is applied to the first ``transform_attrs_number``
        attributes of every input data-chunk, both via ``FunctionApplier`` and
        by hand, and the two results are compared.
        """
        # Local import: the file's import block is not guaranteed to have it.
        import copy

        data_size = 1234
        data_attrs_number = 15
        input_chunks_size = 10
        transform_attrs_number = 10

        functions = [lambda x: np.log(abs(x) + 1), lambda x: np.exp(x),
                     lambda x: x**2]
        data = generate_data_chunk(data_attrs_number, data_size)
        transform_attrs = list(data.keys())[:transform_attrs_number]
        input_data_chunks = create_list_of_data_chunks(data, input_chunks_size)

        for func in functions:
            function_applier = FunctionApplier({a: func for a in transform_attrs})
            for input_data_chunk in input_data_chunks:
                # Hand the applier its own copy: the expected chunk below is
                # the very same object as the input, so if the applier worked
                # in place the test would compare an object with itself.
                # NOTE(review): assumes data-chunks support copy.deepcopy.
                actual_chunk = function_applier(
                    copy.deepcopy(input_data_chunk))
                expected_chunk = input_data_chunk

                # transforming manually values of input data-chunks
                # (in place, so later functions see the transformed values)
                for transform_attr in transform_attrs:
                    expected_chunk[transform_attr] = \
                        func(expected_chunk[transform_attr])

                self.assertTrue(actual_chunk == expected_chunk)