Example 1
def _calc_mean(self, axis=None):
    if axis is None:
        # reduce each chunk to (element count, total sum), then combine
        result = self._new(input=self.dag.transform(
            lambda x: (x.shape[0] * x.shape[1], np.sum(x, axis=axis)),
            [self.input],
        ))._compute()
        total_count = builtins.sum([res[0] for res in result])
        mean = np.sum([res[1] for res in result], axis=axis) / total_count
        return mean
    elif axis == 0:  # mean of each column
        # reduce each chunk to (row count, per-column sums), then combine
        result = self._new(input=self.dag.transform(
            lambda x: (x.shape[0], np.sum(x, axis=axis)),
            [self.input]))._compute()
        total_count = builtins.sum([res[0] for res in result])
        mean = np.sum([res[1] for res in result], axis=axis) / total_count
        # wrap the combined result in a new single-partition DAG
        dag = DAG(self.executor)
        partitioned_input = [mean]
        input = dag.add_input(partitioned_input)
        return self._new(
            dag=dag,
            input=input,
            shape=mean.shape,
            chunks=mean.shape,
            partition_row_counts=mean.shape,
        )
    return NotImplemented
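
The axis == 0 branch reduces each chunk to a (row count, per-column sums) pair and combines the pairs afterwards, so the mean is correctly weighted by chunk size. A minimal standalone sketch of that combine step, using plain NumPy with no DAG or executor involved:

import numpy as np

# two row-chunks of the same 5x3 array
chunks = [np.arange(6).reshape(2, 3), np.arange(6, 15).reshape(3, 3)]

# per-chunk (row count, per-column sums), as in the axis == 0 branch
partials = [(c.shape[0], c.sum(axis=0)) for c in chunks]

total_count = sum(n for n, _ in partials)  # 5 rows in total
mean = np.sum([s for _, s in partials], axis=0) / total_count

assert np.allclose(mean, np.concatenate(chunks).mean(axis=0))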
Example 2
def _repartition_chunks(self, chunks):
    c = chunks[0]
    partition_row_ranges, total_rows, new_num_partitions = calculate_partition_boundaries(
        chunks, self.partition_row_counts)

    # make a new zarr group in the intermediate store with a unique name
    root = self.intermediate_group.create_group(str(uuid.uuid4()))
    # make a zarr group for each partition
    with concurrent.futures.ThreadPoolExecutor(max_workers=64) as executor:
        executor.map(lambda index: root.create_group(str(index)),
                     range(new_num_partitions))

    def tmp_store(pairs):
        for pair in pairs:
            index, offsets, partial_chunk = pair[0], pair[1][0], pair[1][1]
            g = root.require_group(str(index))
            g.array("%s-%s" % (offsets[0], offsets[1]),
                    partial_chunk,
                    chunks=False)

    x1 = self.dag.add_input(partition_row_ranges)
    x2 = self.dag.transform(
        lambda x, y: extract_partial_chunks((x, y), chunks),
        [x1, self.input])

    x3 = self.dag.transform(tmp_store, [x2])

    # run computation to save partial chunks
    list(self.dag.compute(x3))

    # create a new computation to read and combine partial chunks
    def tmp_load(new_index):
        # the last chunk may have fewer than c rows
        if new_index == new_num_partitions - 1 and total_rows % c != 0:
            last_chunk_rows = total_rows % c
            arr = np.zeros((last_chunk_rows, chunks[1]))
        else:
            arr = np.zeros(chunks)
        g = root.require_group(str(new_index))
        for (name, partial_chunk) in g.arrays():
            new_start_offset, new_end_offset = [
                int(n) for n in name.split("-")
            ]
            arr[new_start_offset:new_end_offset] = partial_chunk
        return arr

    dag = DAG(self.executor)
    input = dag.add_input(list(range(new_num_partitions)))
    input = dag.transform(tmp_load, [input])

    # TODO: delete intermediate store when dag is computed
    return ExecutorZappyArray(self.executor, dag, input, self.shape,
                              chunks, self.dtype)
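
Repartitioning runs in two phases: every old partition writes the pieces of itself that fall into each new chunk into the intermediate zarr store, keyed by the row offsets the piece occupies in the new chunk, and then tmp_load assembles each new chunk from its pieces. A minimal in-memory sketch of the assembly step, assuming hypothetical pieces keyed by (start, end) offsets:

import numpy as np

# hypothetical pieces of one new 4x2 chunk, keyed by row offsets within it
pieces = {(0, 3): np.ones((3, 2)), (3, 4): np.full((1, 2), 2.0)}

arr = np.zeros((4, 2))
for (start, end), partial in pieces.items():
    arr[start:end] = partial  # same assignment tmp_load performs

assert arr[3, 0] == 2.0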
Example 3
@classmethod
def ones(cls, executor, shape, chunks, dtype=float, intermediate_store=None):
    dag = DAG(executor)
    # one partition per chunk; each input element is that chunk's shape
    input = dag.add_input(list(get_chunk_sizes(shape, chunks)))
    input = dag.transform(lambda chunk: np.ones(chunk, dtype=dtype), [input])
    return cls(
        executor,
        dag,
        input,
        shape,
        chunks,
        dtype,
        intermediate_store=intermediate_store,
    )
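
This relies on get_chunk_sizes yielding the shape of each row-chunk, with the last chunk possibly shorter. A sketch of the behavior the code appears to assume (chunk_sizes below is a stand-in, not zappy's actual helper):

def chunk_sizes(shape, chunks):
    # yield each row-chunk's shape; the last chunk may have fewer rows
    for start in range(0, shape[0], chunks[0]):
        yield (min(chunks[0], shape[0] - start), shape[1])

assert list(chunk_sizes((5, 3), (2, 3))) == [(2, 3), (2, 3), (1, 3)]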
Example 4
@classmethod
def from_ndarray(cls, executor, arr, chunks, intermediate_store=None):
    func, chunk_indices = ZappyArray._read_chunks(arr, chunks)
    dag = DAG(executor)
    # the input is just the chunk indices
    input = dag.add_input(chunk_indices)
    # add a transform to read chunks
    input = dag.transform(func, [input])
    return cls(
        executor,
        dag,
        input,
        arr.shape,
        chunks,
        arr.dtype,
        intermediate_store=intermediate_store,
    )
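
ZappyArray._read_chunks is expected to return a pair: a function that maps a chunk index to the corresponding rows of arr, and the list of indices to feed into the DAG. A hypothetical stand-in that would satisfy that contract:

import numpy as np

def read_chunks(arr, chunks):
    # hypothetical stand-in for ZappyArray._read_chunks
    starts = list(range(0, arr.shape[0], chunks[0]))
    func = lambda start: arr[start:start + chunks[0]]
    return func, starts

func, indices = read_chunks(np.arange(12).reshape(6, 2), (2, 2))
assert [func(i).shape for i in indices] == [(2, 2), (2, 2), (2, 2)]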
Example 5
def _calc_func_axis_distributive(self, func, axis):
    # func must be distributive: reducing the per-chunk results must give
    # the same answer as reducing the whole array (e.g. sum, amin, amax)
    per_chunk_result = [func(x, axis=axis) for x in self._compute()]
    result = func(per_chunk_result, axis=axis)
    if axis is None:
        return result
    elif axis == 0:  # column-wise
        # wrap the combined result in a new single-partition DAG
        dag = DAG(self.executor)
        partitioned_input = [result]
        input = dag.add_input(partitioned_input)
        return self._new(
            dag=dag,
            input=input,
            shape=result.shape,
            chunks=result.shape,
            partition_row_counts=result.shape,
        )
    return NotImplemented
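
The two-level reduction here is only correct for distributive functions. np.sum qualifies; np.mean does not, which is why Example 1 computes the mean from weighted (count, sum) pairs instead:

import numpy as np

chunks = [np.ones((2, 3)), np.full((3, 3), 4.0)]
full = np.concatenate(chunks)

# distributive: the sum of per-chunk sums equals the sum over the whole array
per_chunk = [np.sum(x, axis=0) for x in chunks]
assert np.allclose(np.sum(per_chunk, axis=0), np.sum(full, axis=0))

# not distributive: the mean of per-chunk means is 2.5, but the true mean is 2.8
per_chunk = [np.mean(x, axis=0) for x in chunks]
assert not np.allclose(np.mean(per_chunk, axis=0), np.mean(full, axis=0))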
Example 6
def test_no_transform():
    dag = DAG(concurrent.futures.ThreadPoolExecutor())
    output = dag.add_input([2])
    assert list(dag.compute(output)) == [2]
Example 7
def test_incompatible_num_partitions():
    dag = DAG(concurrent.futures.ThreadPoolExecutor())
    dag.add_input([2])
    with pytest.raises(AssertionError):
        dag.add_input([1, 5])
Example 8
def test_dag_multiple_partitions():
    dag = DAG(concurrent.futures.ThreadPoolExecutor())
    input = dag.add_input([2, 3, 5])
    output = dag.transform(add_one, [input])
    assert list(dag.compute(output)) == [3, 4, 6]
Example 9
def test_dag_single_partition_binary_function():
    dag = DAG(concurrent.futures.ThreadPoolExecutor())
    input1 = dag.add_input([2])
    input2 = dag.add_input([3])
    output = dag.transform(add, [input1, input2])
    assert list(dag.compute(output)) == [5]
Example 10
def test_dag_single_partition_serial_functions():
    dag = DAG(concurrent.futures.ThreadPoolExecutor())
    input = dag.add_input([2])
    intermediate = dag.transform(add_one, [input])
    output = dag.transform(times_two, [intermediate])
    assert list(dag.compute(output)) == [6]
Example 11
def test_dag_multiple_partitions_binary_function():
    dag = DAG(concurrent.futures.ThreadPoolExecutor())
    input1 = dag.add_input([2, 3, 5])
    input2 = dag.add_input([7, 11, 13])
    output = dag.transform(add, [input1, input2])
    assert list(dag.compute(output)) == [9, 14, 18]
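
The tests above lean on a few helpers defined elsewhere in the test module (along with import concurrent.futures and import pytest). Their presumed definitions, inferred from the expected outputs:

def add_one(x):
    return x + 1

def add(x, y):
    return x + y

def times_two(x):
    return x * 2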