Example #1
def test_cached_array(ms):
    ds = xds_from_ms(ms, group_cols=[], chunks={'row': 1, 'chan': 4})[0]

    data = ds.DATA.data
    cached_data = cached_array(data)
    assert_array_almost_equal(cached_data, data)

    # 2 x row blocks + row x chan x corr blocks
    assert len(_key_cache) == data.numblocks[0] * 2 + data.npartitions
    # rows, row runs and data array caches
    assert len(_array_cache_cache) == 3

    # Pickling works
    pickled_data = pickle.loads(pickle.dumps(cached_data))
    assert_array_almost_equal(pickled_data, data)

    # Same underlying caching is re-used
    # 2 x row blocks + row x chan x corr blocks
    assert len(_key_cache) == data.numblocks[0] * 2 + data.npartitions
    # rows, row runs and data array caches
    assert len(_array_cache_cache) == 3

    del pickled_data, cached_data, data, ds
    gc.collect()

    assert len(_key_cache) == 0
    assert len(_array_cache_cache) == 0
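
A minimal usage sketch of the caching exercised above. It assumes cached_array is importable from daskms.optimisation (as in the test module this example comes from) and that "test.ms" is a hypothetical Measurement Set path.

# Sketch, assuming daskms.optimisation.cached_array; "test.ms" is hypothetical.
import pickle

from daskms import xds_from_ms
from daskms.optimisation import cached_array

ds = xds_from_ms("test.ms", group_cols=[], chunks={"row": 10000})[0]
data = cached_array(ds.DATA.data)                # wrap DATA so its blocks are cached
roundtripped = pickle.loads(pickle.dumps(data))  # cached arrays remain picklable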
Example #2
def _group_ordering_arrays(taql_proxy, index_cols, group, group_nrows,
                           group_row_chunks):
    """
    Returns
    -------
    sorted_rows : :class:`dask.array.Array`
        Sorted table rows chunked on ``group_row_chunks``.
    row_runs : :class:`dask.array.Array`
        Array containing (row_run, resort) tuples.
        Should not be directly computed.
        Chunked on ``group_row_chunks``.
    """
    token = dask.base.tokenize(taql_proxy, group, group_nrows)
    name = 'group-rows-' + token
    chunks = ((group_nrows, ), )
    layers = {(name, 0): (_sorted_group_rows, taql_proxy, group, index_cols)}

    graph = HighLevelGraph.from_collections(name, layers, [])
    group_rows = da.Array(graph, name, chunks, dtype=np.int32)
    group_rows = cached_array(group_rows)

    try:
        shape = (group_nrows, )
        group_row_chunks = normalize_chunks(group_row_chunks, shape=shape)
    except ValueError as e:
        raise GroupChunkingError("%s\n"
                                 "Unable to match chunks '%s' "
                                 "with shape '%s' for group '%d'. "
                                 "This can occur if too few chunk "
                                 "dictionaries have been supplied for "
                                 "the number of groups "
                                 "and an earlier group's chunking strategy "
                                 "is applied to a later one." %
                                 (str(e), group_row_chunks, shape, group))

    group_rows = group_rows.rechunk(group_row_chunks)
    row_runs = group_rows.map_blocks(row_run_factory,
                                     sort_dir="read",
                                     dtype=object)

    row_runs = cached_array(row_runs)

    return group_rows, row_runs
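
The chunk matching above relies on dask's normalize_chunks: the requested per-group row chunks are expanded against the group's row count, and a mismatch surfaces as the GroupChunkingError raised in the except clause. A small illustration of the dask side of that behaviour, with hypothetical group sizes:

# Illustration of the dask call used above (hypothetical numbers).
from dask.array.core import normalize_chunks

# 100 000 rows in a group with requested row chunks of 30 000
print(normalize_chunks(30000, shape=(100000,)))
# ((30000, 30000, 30000, 10000),)

# Explicit chunks that do not sum to the group's row count raise ValueError,
# which _group_ordering_arrays converts into GroupChunkingError.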
Example #3
def row_ordering(taql_proxy, index_cols, chunks):
    nrows = taql_proxy.nrows().result()
    chunks = normalize_chunks(chunks['row'], shape=(nrows, ))
    token = dask.base.tokenize(taql_proxy, index_cols, chunks, nrows)
    name = 'rows-' + token
    layers = {}
    start = 0

    for i, c in enumerate(chunks[0]):
        layers[(name, i)] = (_sorted_rows, taql_proxy, start, c)
        start += c

    graph = HighLevelGraph.from_collections(name, layers, [])
    rows = da.Array(graph, name, chunks=chunks, dtype=np.int64)
    rows = cached_array(rows)
    row_runs = rows.map_blocks(row_run_factory, sort_dir="read", dtype=object)
    row_runs = cached_array(row_runs)

    return rows, row_runs
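
As an illustration, the loop above lays out one _sorted_rows task per row chunk, with the start offset accumulating across chunks (hypothetical chunking shown):

# Hypothetical task layout for 10 rows read in chunks of 4:
# chunks == ((4, 4, 2),)
# layers == {("rows-<token>", 0): (_sorted_rows, taql_proxy, 0, 4),
#            ("rows-<token>", 1): (_sorted_rows, taql_proxy, 4, 4),
#            ("rows-<token>", 2): (_sorted_rows, taql_proxy, 8, 2)}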
Example #4
def test_cached_data_token(token):
    zeros = da.zeros(1000, chunks=100)
    carray = cached_array(zeros, token)

    dsk = dict(carray.__dask_graph__())
    k, v = dsk.popitem()
    cache = v[1]

    if token is None:
        assert cache.token is not None
    else:
        assert cache.token == token
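
The token argument controls the identity attached to the underlying cache; when it is omitted, cached_array generates one, as the assertions above show. A sketch of both forms:

# Sketch of both token forms exercised by the test above.
import dask.array as da

from daskms.optimisation import cached_array

zeros = da.zeros(1000, chunks=100)
auto = cached_array(zeros)                   # a token is generated internally
explicit = cached_array(zeros, "my-token")   # cache carries the supplied token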
Example #5
def _write_datasets(table, table_proxy, datasets, columns, descriptor,
                    table_keywords, column_keywords):
    _, table_name, subtable = table_path_split(table)
    table_name = '::'.join((table_name, subtable)) if subtable else table_name
    row_orders = []

    # Put table and column keywords
    table_proxy.submit(_put_keywords, WRITELOCK, table_keywords,
                       column_keywords).result()

    # Sort datasets on the key ("ROWID" not present, index) such that
    # datasets with ROWIDs are handled first, while
    # those without (which imply appends to the MS)
    # are handled last
    sorted_datasets = sorted(enumerate(datasets),
                             key=lambda t:
                             ("ROWID" not in t[1].data_vars, t[0]))

    # Establish row orders for each dataset
    for di, ds in sorted_datasets:
        try:
            rowid = ds.ROWID.data
        except AttributeError:
            # Add operation
            # No ROWIDs, assume they're missing from the table
            # and remaining datasets. Generate addrows
            # NOTE(sjperkins)
            # This could be somewhat brittle, but exists to
            # update MS empty subtables once they've been
            # created along with the main MS by a call to default_ms.
            # Users could also use it to append rows to an existing table.
            # An xds_append_to_table may be a better solution...
            last_datasets = datasets[di:]
            last_row_orders = add_row_order_factory(table_proxy, last_datasets)

            # We don't inline the row ordering if it is derived
            # from the row sizes of provided arrays.
            # The range of possible dependencies is far too large to inline
            row_orders.extend([(False, lro) for lro in last_row_orders])
            # We have established row orders for all datasets
            # at this point, quit the loop
            break
        else:
            # Update operation
            # Generate row orderings from existing row IDs
            row_order = rowid.map_blocks(row_run_factory,
                                         sort_dir="write",
                                         dtype=object)

            # TODO(sjperkins)
            # There's an assumption here that rowid is an
            # operation with minimal dependencies
            # (i.e. derived from xds_from_{ms, table})
            # Caching flattens the graph into a single layer
            if len(row_order.__dask_graph__().layers) > 1:
                log.warning("Caching an update row ordering "
                            "with more than one layer")

            row_order = cached_array(row_order)
            # Inline the row ordering in the graph
            row_orders.append((True, row_order))

    assert len(row_orders) == len(datasets)

    datasets = []

    for (di, ds), (inline, row_order) in zip(sorted_datasets, row_orders):
        # Hold the variables representing array writes
        write_vars = {}

        # Generate a dask array for each column
        for column in columns:
            try:
                variable = ds.data_vars[column]
            except KeyError:
                log.warning("Ignoring '%s' not present "
                            "on dataset %d" % (column, di))
                continue
            else:
                full_dims = variable.dims
                array = variable.data

            if not isinstance(array, da.Array):
                raise TypeError("%s on dataset %d is not a dask Array "
                                "but a %s" % (column, di, type(array)))

            args = [row_order, ("row", )]

            # We only need to pass in dimension extent arrays if
            # there is more than one chunk in any of the non-row dimensions.
            # In that case putcolslice is required; otherwise putcol suffices.
            if not all(len(c) == 1 for c in array.chunks[1:]):
                # Add extent arrays
                for d, c in zip(full_dims[1:], array.chunks[1:]):
                    args.append(dim_extents_array(d, c))
                    args.append((d, ))

            # Add other variables
            args.extend([table_proxy, None, column, None, array, full_dims])

            # Name of the dask array representing this column
            token = dask.base.tokenize(di, args)
            name = "-".join((table_name, 'write', column, token))

            write_col = da.blockwise(
                putter_wrapper,
                full_dims,
                *args,
                # All dims shrink to 1,
                # a single bool is returned
                adjust_chunks={d: 1
                               for d in full_dims},
                name=name,
                align_arrays=False,
                dtype=bool)

            if inline:
                write_col = inlined_array(write_col, [row_order])

            write_vars[column] = (full_dims, write_col)

        # Append a dataset with the write operations
        datasets.append(Dataset(write_vars))

    # Return an empty dataset
    if len(datasets) == 0:
        return Dataset({})
    # Return singleton
    elif len(datasets) == 1:
        return datasets[0]

    return datasets
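
_write_datasets is internal; the public write entry point in dask-ms is xds_to_table, which reaches this code and distinguishes the update case (datasets carrying ROWID, as read by xds_from_ms) from the append case (no ROWID). A hedged sketch of the update path, with a hypothetical Measurement Set path:

# Hedged sketch of the public write path ("test.ms" is hypothetical).
import dask

from daskms import xds_from_ms, xds_to_table

datasets = xds_from_ms("test.ms")                      # datasets carry ROWID
writes = xds_to_table(datasets, "test.ms", ["DATA"])   # update existing rows
dask.compute(writes)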
Example #6
File: writes.py Project: hrkloeck/dask-ms
def cached_row_order(rowid):
    """
    Produce a cached row_order array from the given rowid array.

    There's an assumption here that rowid is an
    operation with minimal dependencies
    (i.e. derived from xds_from_{ms, table})
    Caching flattens the graph into one or two layers
    depending on whether standard or group ordering is requested.

    Therefore, this function warns if the rowid graph looks unusual,
    mostly because it will be included in the cached row_order array,
    and we don't want its graph to be too big or unusual.

    Parameters
    ----------
    rowid : :class:`dask.array.Array`
        rowid array

    Returns
    -------
    row_order : :class:`dask.array.Array`
        An array of row order tuples
    """
    layers = rowid.__dask_graph__().layers

    # daskms.ordering.row_ordering case
    # or daskms.ordering.group_row_ordering case without rechunking
    # Check for standard layer
    if len(layers) == 1:
        layer_name = list(layers.keys())[0]

        if (not layer_name.startswith("row-")
                and not layer_name.startswith("group-rows-")):

            log.warning(
                "Unusual ROWID layer %s. "
                "This is probably OK but "
                "could foreshadow incorrect "
                "behaviour.", layer_name)
    # daskms.ordering.group_row_ordering case with rechunking
    # Check for standard layers
    elif len(layers) == 2:
        layer_names = list(sorted(layers.keys()))

        if not (layer_names[0].startswith('group-rows-')
                and layer_names[1].startswith('rechunk-merge-')):

            log.warning(
                "Unusual ROWID layers %s for "
                "the group ordering case. "
                "This is probably OK but "
                "could foreshadow incorrect "
                "behaviour.", layer_names)
    # ROWID has been extended or modified somehow, warn
    else:
        layer_names = list(sorted(layers.keys()))
        log.warning(
            "Unusual number of ROWID layers > 2 "
            "%s. This is probably OK but "
            "could foreshadow incorrect "
            "behaviour or sub-par performance if "
            "the ROWID graph is large.", layer_names)

    row_order = rowid.map_blocks(row_run_factory,
                                 sort_dir="write",
                                 dtype=object)

    return cached_array(row_order)
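
A short usage sketch: the ROWID coordinate of a dataset read with xds_from_ms has the layer structure this function checks for, so it can be passed in directly ("test.ms" is again a hypothetical path):

# Sketch, assuming a dataset read via xds_from_ms exposes a ROWID coordinate.
from daskms import xds_from_ms

ds = xds_from_ms("test.ms")[0]
row_order = cached_row_order(ds.ROWID.data)   # cached (row_run, resort) blocks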