Example 1
def get_quantized_ctable(dtype, cparams, quantize=None, expectedlen=None):
    """Return a ctable with the quantize filter enabled for floating point cols.
    
    License
        This function is taken from the reflexible package (https://github.com/spectraphilic/reflexible/tree/master/reflexible).
        Authored by John F Burkhart <*****@*****.**> with contributions Francesc Alted <*****@*****.**>.
        Licensed under: 'This script follows creative commons usage.'
    """
    import bcolz

    columns, names = [], []
    for fname, ftype in dtype.descr:
        names.append(fname)
        if 'f' in ftype:
            cparams2 = bcolz.cparams(clevel=cparams.clevel,
                                     cname=cparams.cname,
                                     quantize=quantize)
            columns.append(
                bcolz.zeros(0,
                            dtype=ftype,
                            cparams=cparams2,
                            expectedlen=expectedlen))
        else:
            columns.append(
                bcolz.zeros(0,
                            dtype=ftype,
                            cparams=cparams,
                            expectedlen=expectedlen))
    return bcolz.ctable(columns=columns, names=names)
Example 2
def get_quantized_ctable(dtype, cparams, quantize=None, expectedlen=None):
    """Return a ctable with the quantize filter enabled for floating point cols.
    """
    import bcolz

    columns, names = [], []
    for fname, ftype in dtype.descr:
        names.append(fname)
        if 'f' in ftype:
            cparams2 = bcolz.cparams(clevel=cparams.clevel, cname=cparams.cname, quantize=quantize)
            columns.append(bcolz.zeros(0, dtype=ftype, cparams=cparams2, expectedlen=expectedlen))
        else:
            columns.append(bcolz.zeros(0, dtype=ftype, cparams=cparams, expectedlen=expectedlen))
    return bcolz.ctable(columns=columns, names=names)
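A minimal usage sketch for the function above; the dtype and compression parameters are illustrative assumptions. The float column receives the quantize filter, while the integer column keeps the unmodified cparams:

import numpy as np
import bcolz

dtype = np.dtype([('id', 'i4'), ('value', 'f4')])
cparams = bcolz.cparams(clevel=5, cname='lz4')
ct = get_quantized_ctable(dtype, cparams, quantize=2, expectedlen=10000)
ct.append(np.array([(1, 3.14159)], dtype=dtype))  # append rows as structured arrays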
Example 3
def test_into_inplace():
    x = np.arange(600).reshape((20, 30))
    a = into(Array, x, blockshape=(4, 5))
    b = bcolz.zeros(shape=(20, 30), dtype=x.dtype)

    append(b, a, inplace=True)
    assert eq(b[:], x)
Example 5
 def test01c(self):
     """Testing `zeros` constructor (III)"""
     a = np.zeros((2, 2), dtype='(4,)i4')
     b = bcolz.zeros((2, 2), dtype='(4,)i4', rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     # print("b->", repr(b))
     assert_array_equal(a, b, "Arrays are not equal")
Example 6
def read_releases_v10(pathname):
    """
    Parse the release file in `pathname` and return a ctable with its contents.

    This is only suited for files in Fortran90 namelist format (FP v10).

    Parameters
    ----------
    pathname : str
      Release file name (in Fortran90 namelist format).

    Returns
    -------
    A ctable object from bcolz package.
    """
    import re

    import bcolz
    import numpy as np

    # Setup the container for the data
    dtype = [('IDATE1', np.int32), ('ITIME1', np.int32), ('IDATE2', np.int32),
             ('ITIME2', np.int32), ('LON1', np.float32), ('LON2', np.float32),
             ('LAT1', np.float32), ('LAT2', np.float32), ('Z1', np.float32),
             ('Z2', np.float32), ('ZKIND', np.int8), ('MASS', np.float32),
             ('PARTS', np.int32), ('COMMENT', 'S32')]
    cparams = bcolz.cparams(cname="lz4", clevel=6, shuffle=1)
    ctable = bcolz.zeros(0, dtype=dtype, cparams=cparams)
    nrecords = ctable['IDATE1'].chunklen
    releases = np.zeros(nrecords, dtype=dtype)

    # Prepare for reading the input
    with open(pathname, 'r') as f:
        input_str = f.read()
    marker = "&RELEASE\n"
    len_marker = len(marker)
    release_re = r'\S+=\s+[\"|\s](\S+)[,|\"|\w]'

    # Loop over all the marker groups
    i, n = 0, 0
    while True:
        i = input_str.find(marker, i)
        j = input_str.find(marker, i + 1)
        n += 1
        group_block = input_str[i + len_marker:j]
        i = j
        values = tuple(re.findall(release_re, group_block))
        try:
            releases[(n - 1) % nrecords] = values
        except ValueError:
            print("Problem at: group: %d, %s" % (n, group_block))
            print("values:", values)
            raise
        if (n % nrecords) == 0:
            ctable.append(releases)
        if (i == -1) or (j == -1):
            break  # marker is not found anymore
    # Remainder
    ctable.append(releases[:n % nrecords])
    ctable.flush()

    return ctable
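A hedged usage sketch; the path is an assumption and must point to a FLEXPART v10 RELEASES file in Fortran90 namelist format:

releases = read_releases_v10('options/RELEASES')  # hypothetical path
print(releases.names)       # column names: 'IDATE1', 'ITIME1', ..., 'COMMENT'
lons = releases['LON1'][:]  # materialize a single column as a numpy array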
Example 7
 def test00a(self):
     """Testing wheretrue() in combination with a list constructor"""
     a = bcolz.zeros(self.N, dtype="bool", rootdir=self.rootdir)
     a[30:40] = bcolz.ones(10, dtype="bool")
     alist = list(a)
     blist1 = [r for r in a.wheretrue()]
     self.assertTrue(blist1 == list(range(30, 40)))
     alist2 = list(a)
     self.assertTrue(alist == alist2, "wheretrue() not working correctly")
Example 8
    def __init__(self, df, rootdir=None, chunklen=2**16, **kwargs):
        if rootdir is None:
            rootdir = tempfile.mkdtemp('.cframe')
            self._explicitly_given_path = False
        else:
            os.mkdir(rootdir)
            self._explicitly_given_path = True

        self.blocks = dict((col,
                       bcolz.zeros(rootdir=os.path.join(rootdir, '%s.bcolz' % col),
                                   shape=(0,),
                                   dtype=df.dtypes[col], safe=False,
                                   chunklen=chunklen, **kwargs))
                        for col in df.columns)
        self.columns = df.columns
        self.index = bcolz.zeros(shape=(0,), dtype=df.index.values.dtype,
                                 safe=False, chunklen=chunklen, **kwargs)
        self.rootdir = rootdir
Example 10
 def getobject(self):
     if self.flavor == 'carray':
         obj = bcolz.zeros(10, dtype="i1", rootdir=self.rootdir)
         assert type(obj) == bcolz.carray
     elif self.flavor == 'ctable':
         obj = bcolz.fromiter(((i, i*2) for i in range(10)), dtype='i2,f4',
                              count=10, rootdir=self.rootdir)
         assert type(obj) == bcolz.ctable
     return obj
Example 11
    def __init__(self, n_elements, sizes=None, dtype=float, datadir=None):
        """
        Create a new ring buffer with the given number of elements,
        individual element size, and element type.

        Parameters:
        -----------
        n_elements: int
            The number of elements (individual ring buffers)
        sizes: int, list or array
            Size for all elements or list/array with sizes for elements
        dtype: data-type, optional (default=float)
            Data type of the ring buffer
        datadir: str
            If specified the data is stored on disk (default=None)
        """

        self._arr = np.empty(n_elements, object)
        if isinstance(sizes, (list, np.ndarray)):
            self._sizes = np.array(sizes)
            for i in range(len(self._arr)):
                if datadir is not None:
                    self._arr[i] = bc.zeros(int(sizes[i]),
                                            dtype,
                                            rootdir=datadir + '/arr_' + str(i),
                                            mode='w')
                else:
                    self._arr[i] = bc.zeros(int(sizes[i]), dtype)
        elif isinstance(sizes, int):
            self._sizes = np.array([sizes] * len(self._arr))
            for i in range(len(self._arr)):
                if datadir is not None:
                    self._arr[i] = bc.zeros(int(sizes),
                                            dtype,
                                            rootdir=datadir + '/arr_' + str(i),
                                            mode='w')
                else:
                    self._arr[i] = bc.zeros(int(sizes), dtype)

        self._dtype = dtype
        self._sizes = np.zeros(n_elements, int)
        if sizes is not None:
            self._sizes += sizes
        self._indices = np.zeros(n_elements, dtype=int)
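The datadir branches above only decide whether each carray is disk-backed; a minimal sketch of that distinction with bcolz directly (the rootdir path is illustrative):

import bcolz as bc

mem_buf = bc.zeros(1024, dtype=float)  # compressed, in-memory carray
disk_buf = bc.zeros(1024, dtype=float, rootdir='/tmp/arr_0', mode='w')  # persisted on disk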
Example 12
 def test01a(self):
     """Testing where() in combination with a list constructor"""
     a = bcolz.zeros(self.N, dtype="bool")
     a[30:40] = bcolz.ones(10, dtype="bool")
     b = bcolz.arange(self.N, dtype="f4")
     blist = list(b)
     blist1 = [r for r in b.where(a)]
     self.assertTrue(blist1 == list(range(30, 40)))
     blist2 = list(b)
     self.assertTrue(blist == blist2, "where() not working correctly")
Example 13
def test_append_to_array():
    x = np.arange(600).reshape((20, 30))
    a = into(Array, x, blockshape=(4, 5))
    b = bcolz.zeros(shape=(0, 30), dtype=x.dtype)

    append(b, a)
    assert eq(b[:], x)

    with tmpfile('hdf5') as fn:
        h = into(fn+'::/data', a)
        assert eq(h[:], x)
        h.file.close()
Example 15
def preprocess_brats(data_path,
                     path,
                     size,
                     train_data=True,
                     x_name='data',
                     y_name='labels',
                     csv_name='mapping.csv'):
    path = Path(path)
    path.mkdir(parents=True, exist_ok=True)
    size = listify(size, 3)
    volumes = bcolz.zeros([0, 4, *size],
                          dtype=np.int64,
                          chunklen=1,
                          mode='w',
                          rootdir=path / x_name)
    if train_data:
        labels = bcolz.zeros([0, *size],
                             dtype=np.int64,
                             chunklen=1,
                             mode='w',
                             rootdir=path / y_name)
    processors = [CropProcessor(), ResizeProcessor(size)]
    files = (data_path / 'LGG').ls() + (
        data_path / 'HGG').ls() if train_data else data_path.ls()

    (path / csv_name).open('w').write('modal,subject\n' if train_data else 'subject\n')
    with (path / csv_name).open('a') as f:
        for file in progress_bar(files):
            if not file.is_dir(): continue
            x, y = get_brats_data(file, train_data)
            for p in processors:
                x, y = p(x, y)
            volumes.append(x)
            if train_data: labels.append(y)
            f.write(f'{file.parent.name},{file.name}\n'
                    if train_data else f'{file.name}\n')

    volumes.flush()
    if train_data: labels.flush()
Example 16
def preprocess_brain_val(source_dir: Path, destination_dir: Path):
    # Full Validation 100 (out of 800) brain 2d.
    slice_w = 256
    slice_h = 256

    train = list(range(800))
    random.seed(0)
    random.shuffle(train)
    disk_x = bcolz.zeros((0, 1, slice_w, slice_h),
                         rootdir=destination_dir,
                         chunklen=1)

    for i in train[700:]:
        volume = nib.load(source_dir / "{:05}.nii.gz".format(i)).get_fdata()

        for s in range(256):
            x = volume[None, None, :, :, s]
            disk_x.append(x)
            disk_x.flush()
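Once written, the bcolz rootdir can be reopened lazily without loading the whole dataset; a sketch assuming the destination directory used above:

import bcolz

disk_x = bcolz.open('path/to/destination_dir', mode='r')  # hypothetical path
print(disk_x.shape)  # e.g. (n_slices, 1, 256, 256)
x0 = disk_x[0]       # chunklen=1 means each read decompresses a single slice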
Example 17
    def __init__(self,
                 data_element_shape,
                 dtype,
                 batch_size,
                 save_path,
                 length=None,
                 append=False,
                 kwargs={}):
        import bcolz
        super(bcolz_array_writer, self).__init__(None, data_element_shape,
                                                 dtype, batch_size, length)
        self.save_path = save_path
        self.kwargs = kwargs

        # Set up array kwargs
        self.arr_kwargs = {
            'expectedlen': length,
            'cparams': bcolz.cparams(clevel=5, shuffle=True, cname='blosclz'),
            'dtype': dtype,
            'rootdir': save_path
        }
        if kwargs is not None:
            self.arr_kwargs.update(kwargs)

        # Create the file-backed array, open for writing.
        # (check if the array exists; if not, create it)
        if append:
            try:
                self.storage_array = bcolz.open(self.save_path, mode='a')
                self.storage_array_ptr = len(self.storage_array)
            except FileNotFoundError:
                append = False
        if not append:
            try:
                self.storage_array = bcolz.zeros(shape=(0, ) +
                                                 data_element_shape,
                                                 mode='w',
                                                 **self.arr_kwargs)
                self.storage_array_ptr = 0
            except:
                print("Error: failed to create file-backed bcolz storage "
                      "array.")
                raise
Example 18
def preprocess_abdomen_val(source_dir: Path, destination_path: Path):
    # Abdom data val 2d, downsample by 2x, 50/500 volumes, all slices
    slice_w = 256  # 512
    slice_h = 256  # 512

    train = list(range(550))
    random.seed(0)
    random.shuffle(train)
    disk_x = bcolz.zeros((0, 1, slice_w, slice_h),
                         rootdir=destination_path,
                         chunklen=1)

    for i in train[450:]:
        volume = nib.load(source_dir / "{:05}.nii.gz".format(i)).get_fdata()
        #volume = nib.load("../data/abdom/abdom_train/{:05}.nii.gz".format(i)).get_fdata()

        slices = list(range(0, 512, 2))
        for s in slices:
            x = volume[None, None, :, :, s]  # .transpose([0, 1, 3, 2])
            x = zoom(x[0, 0, ...], 0.5, order=2)

            x = x[None, None, ...]
            disk_x.append(x)
            disk_x.flush()
Example 20
    def make_group_index(self, factor_list, values_list, groupby_cols,
                         array_length, bool_arr):
        # create unique groups for groupby loop

        if len(factor_list) == 0:
            # no columns to groupby over, so directly aggregate the measure
            # columns to 1 total (index 0/zero)
            factor_carray = bcolz.zeros(array_length, dtype='int64')
            values = ['Total']

        elif len(factor_list) == 1:
            # single column groupby, the groupby output column
            # here is 1:1 to the values
            factor_carray = factor_list[0]
            values = values_list[0]

        else:
            # multi column groupby
            # nb: this might also be cached in the future

            # first combine the factorized columns to single values
            factor_set = {x: y for x, y in zip(groupby_cols, factor_list)}

            # create a numexpr expression that calculates the place on
            # a cartesian join index
            eval_str = ''
            previous_value = 1
            for col, values \
                    in zip(reversed(groupby_cols), reversed(values_list)):
                if eval_str:
                    eval_str += ' + '
                eval_str += str(previous_value) + '*' + col
                previous_value *= len(values)

            # calculate the cartesian group index for each row
            factor_input = bcolz.eval(eval_str, user_dict=factor_set)

            # now factorize the unique groupby combinations
            factor_carray, values = ctable_ext.factorize(factor_input)

        skip_key = None

        if bool_arr is not None:
            # make all non relevant combinations -1
            factor_carray = bcolz.eval(
                '(factor + 1) * bool - 1',
                user_dict={'factor': factor_carray, 'bool': bool_arr})
            # now check how many unique values there are left
            factor_carray, values = ctable_ext.factorize(factor_carray)
            # values might contain one value too much (-1) (no direct lookup
            # possible because values is a reversed dict)
            filter_check = \
                [key for key, value in values.items() if value == -1]
            if filter_check:
                skip_key = filter_check[0]

        # using nr_groups as a total length might be off by one due to the skip_key
        # (skipping a row in aggregation)
        # but that is okay normally
        nr_groups = len(values)
        if skip_key is None:
            # if we shouldn't skip a row, set it at the first row after the total number of groups
            skip_key = nr_groups

        return factor_carray, nr_groups, skip_key
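The eval string built above encodes every combination of column values as a single integer on a cartesian index; a small numpy illustration of the same arithmetic with invented data:

import numpy as np

color = np.array([0, 1, 0, 2])  # factorized column with 3 unique values
shape = np.array([0, 0, 1, 1])  # factorized column with 2 unique values
# with groupby_cols == ['color', 'shape'], the reversed loop builds '1*shape + 2*color',
# where the coefficient 2 is len(unique shape values)
group_index = 1 * shape + 2 * color
print(group_index)  # [0 2 1 5] -- equal values mark rows of the same group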
Example 21
 def test01b(self):
     """Testing where() with a multidimensional array"""
     a = bcolz.zeros((self.N, 10), dtype="bool", rootdir=self.rootdir)
     a[30:40] = bcolz.ones(10, dtype="bool")
     b = bcolz.arange(self.N * 10, dtype="f4").reshape((self.N, 10))
     self.assertRaises(NotImplementedError, b.where, a)
Example 22
    def make_group_index(self, factor_list, values_list, groupby_cols,
                         array_length, bool_arr):
        '''Create unique groups for groupby loop

            Args:
                factor_list:
                values_list:
                groupby_cols:
                array_length:
                bool_arr:

            Returns:
                carray: (factor_carray)
                int: (nr_groups) the number of resulting groups
                int: (skip_key)
        '''

        def _create_eval_str(groupby_cols, values_list, check_overflow=True):

            eval_list = []
            eval_str = ''
            col_list = []
            previous_value = 1
            # Sort evaluated columns by length
            col_len_list = [(col, values) for col, values in zip(groupby_cols, values_list)]
            col_len_list.sort(key=lambda x: len(x[1]))
            groupby_cols = [col for col, _ in col_len_list]
            values_list = [values for _, values in col_len_list]

            for col, values \
                    in zip(groupby_cols, values_list):

                # check for overflow
                if check_overflow:
                    if previous_value * len(values) > 4294967295:
                        eval_list.append((eval_str, col_list))
                        # reset
                        eval_str = ''
                        col_list = []
                        previous_value = 1

                if eval_str:
                    eval_str += ' + '
                else:
                    eval_str += '-2147483648 + '

                eval_str += str(previous_value) + '*' + col
                col_list.append(col)
                previous_value *= len(values)

            eval_list.append((eval_str, col_list))
            return eval_list

        def _calc_group_index(eval_list, factor_set, vm=None):
            factorize_list = []
            for eval_node in eval_list:
                # calculate the cartesian group index for each row
                factor_input = bcolz.eval(eval_node[0], user_dict=factor_set, vm=vm)
                # now factorize the unique groupby combinations
                sub_factor_carray, sub_values = ctable_ext.factorize(factor_input)
                factorize_list.append((sub_factor_carray, sub_values))
            return factorize_list

        def _is_reducible(eval_list):
            for eval_node in eval_list:
                if len(eval_node[1]) > 1:
                    return True
            return False

        def calc_index(groupby_cols, values_list, factor_set, vm=None):
            # Initialize eval list
            eval_list = _create_eval_str(groupby_cols, values_list)

            # Reduce expression as possible
            while _is_reducible(eval_list):
                del groupby_cols
                del values_list
                factorize_list = _calc_group_index(eval_list, factor_set)
                factor_set = {'g' + str(i): x[0] for i, x in enumerate(factorize_list)}
                groupby_cols = ['g' + str(i) for i, x in enumerate(factorize_list)]
                values_list = [x[1] for i, x in enumerate(factorize_list)]
                eval_list = _create_eval_str(groupby_cols, values_list)
            # If we have multiple expressions that cannot be reduced anymore, rewrite as a single one and use Python vm
            if len(eval_list) > 1:
                eval_list = _create_eval_str(groupby_cols, values_list, check_overflow=False)
                vm = 'python'

            del groupby_cols
            del values_list

            # Now we have a single expression, factorize it
            return _calc_group_index(eval_list, factor_set, vm=vm)[0]

        # create unique groups for groupby loop
        if len(factor_list) == 0:
            # no columns to groupby over, so directly aggregate the measure
            # columns to 1 total (index 0/zero)
            factor_carray = bcolz.zeros(array_length, dtype='int64')
            values = ['Total']
        elif len(factor_list) == 1:
            # single column groupby, the groupby output column
            # here is 1:1 to the values
            factor_carray = factor_list[0]
            values = values_list[0]
        else:
            # multi column groupby
            # nb: this might also be cached in the future
            # first combine the factorized columns to single values
            factor_set = {x: y for x, y in zip(groupby_cols, factor_list)}
            # create a numexpr expression that calculates the place on
            # a cartesian join index
            factor_carray, values = calc_index(groupby_cols, values_list, factor_set)

        skip_key = None

        if bool_arr is not None:
            # make all non relevant combinations -1
            factor_carray = bcolz.eval(
                '(factor + 1) * bool - 1',
                user_dict={'factor': factor_carray, 'bool': bool_arr})
            # now check how many unique values there are left
            factor_carray, values = ctable_ext.factorize(factor_carray)
            # values might contain one value too much (-1) (no direct lookup
            # possible because values is a reversed dict)
            filter_check = \
                [key for key, value in values.items() if value == -1]
            if filter_check:
                skip_key = filter_check[0]

        # using nr_groups as a total length might be off by one due to the skip_key
        # (skipping a row in aggregation)
        # but that is okay normally
        nr_groups = len(values)
        if skip_key is None:
            # if we shouldn't skip a row, set it at the first row after the total number of groups
            skip_key = nr_groups

        return factor_carray, nr_groups, skip_key
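The -2147483648 offset and the 4294967295 cap in _create_eval_str keep each combined index inside a 2**32-wide range starting at the bottom of int32, splitting the expression before it would overflow; a toy rendering of one generated expression (data invented for the example):

import numpy as np

g0 = np.array([0, 1, 2], dtype=np.int64)  # 3 unique values
g1 = np.array([0, 1, 0], dtype=np.int64)  # 2 unique values; shortest column leads after sorting
idx = -2147483648 + 1 * g1 + 2 * g0       # mirrors the eval string built above
print(idx)  # [-2147483648 -2147483645 -2147483644]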
Example 24
 def test00b(self):
     """Testing wheretrue() with a multidimensional array"""
     a = bcolz.zeros((self.N, 10), dtype="bool", rootdir=self.rootdir)
     a[30:40] = bcolz.ones(10, dtype="bool")
     self.assertRaises(NotImplementedError, a.wheretrue)
Example 25
def read_partpositions(filename,
                       nspec,
                       ctable=True,
                       clevel=5,
                       cname="lz4",
                       quantize=None):
    """Read the particle positions in `filename`.

    This function strives to use as little memory as possible; for this, a
    bcolz ctable container is used for holding the data.  Besides being
    compressed in memory, its chunked nature makes it a natural fit for data
    that needs to be appended, because it does not need expensive memory
    resize operations.

    NOTE: This code reads directly from an UNFORMATTED SEQUENTIAL Fortran data
    file, so care has been taken to skip the record length at the beginning
    and the end of every record.  See:
    http://stackoverflow.com/questions/8751185/fortran-unformatted-file-format

    Parameters
    ----------
    filename : string
        The file name of the particle raw data
    nspec : int
        number of species in particle raw data
    ctable : bool
        If True, return a bcolz ctable container; otherwise, return a numpy
        structured array.
    clevel : int
        Compression level for the ctable container
    cname : string
        Codec name for the ctable container.  Can be 'blosclz', 'lz4', 'zlib' or 'zstd'.
    quantize : int
        Quantize data to improve (lossy) compression.  Data is quantized using
        np.around(scale*data)/scale, where scale is 2**bits, and bits is
        determined from the quantize value.  For example, if quantize=1, bits
        will be 4.  0 means that the quantization is disabled.

    Returns
    -------
    ctable object OR structured_numpy_array

    Returning a ctable is preferred because it is used internally, so it does
    not need to be converted to other formats; this makes it faster and more
    memory efficient.

    Note: Passing a `quantize` param > 0 can increase the compression ratio of the ctable
    container, but it may also slow down the reading speed significantly.

    License
        This function is taken from the reflexible package (https://github.com/spectraphilic/reflexible/tree/master/reflexible).
        Authored by John F Burkhart <*****@*****.**> with contributions Francesc Alted <*****@*****.**>.
        Licensed under: 'This script follows creative commons usage.'


    """

    CHUNKSIZE = 10 * 1000
    xmass_dtype = [('xmass_%d' % (i + 1), 'f4') for i in range(nspec)]
    # note age is calculated from itramem by adding itimein
    out_fields = [('npoint', 'i4'), ('xtra1', 'f4'), ('ytra1', 'f4'),
                  ('ztra1', 'f4'), ('itramem', 'i4'), ('topo', 'f4'),
                  ('pvi', 'f4'), ('qvi', 'f4'), ('rhoi', 'f4'),
                  ('hmixi', 'f4'), ('tri', 'f4'), ('tti', 'f4')] + xmass_dtype
    raw_fields = [('begin_recsize', 'i4')
                  ] + out_fields + [('end_recsize', 'i4')]
    raw_rectype = np.dtype(raw_fields)
    recsize = raw_rectype.itemsize

    cparams = bcolz.cparams(clevel=clevel, cname=cname)
    if quantize is not None and quantize > 0:
        out = get_quantized_ctable(raw_rectype,
                                   cparams=cparams,
                                   quantize=quantize,
                                   expectedlen=int(1e6))
    else:
        out = bcolz.zeros(0,
                          dtype=raw_rectype,
                          cparams=cparams,
                          expectedlen=int(1e6))

    with open(filename, "rb", buffering=1) as f:
        # The timein value is at the beginning of the file
        reclen = np.ndarray(shape=(1, ), buffer=f.read(4), dtype="i4")[0]
        assert reclen == 4
        itimein = np.ndarray(shape=(1, ), buffer=f.read(4), dtype="i4")
        reclen = np.ndarray(shape=(1, ), buffer=f.read(4), dtype="i4")[0]
        assert reclen == 4
        nrec = 0
        while True:
            # Try to read a complete chunk
            data = f.read(CHUNKSIZE * recsize)
        read_records = len(data) // recsize  # the actual number of records read
            chunk = np.ndarray(shape=(read_records, ),
                               buffer=data,
                               dtype=raw_rectype)
            # Add the chunk to the out array
            out.append(chunk[:read_records])
            nrec += read_records
            if read_records < CHUNKSIZE:
                # We reached the end of the file
                break

    # Truncate at the max length (last row is always a sentinel, so remove it)
    out.trim(1)
    # Remove the first and last columns
    out.delcol("begin_recsize")
    out.delcol("end_recsize")

    if ctable:
        return out
    else:
        return out[:]
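The quantization described in the docstring is plain rounding in a scaled space; a worked illustration of the np.around(scale*data)/scale formula (bits=4 follows the quantize=1 example above):

import numpy as np

data = np.array([3.14159265, 2.71828183])
scale = 2.0 ** 4  # bits = 4
print(np.around(scale * data) / scale)  # [3.125  2.6875]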
Example 26
File: zeros.py  Project: Blosc/bcolz
from time import time

import numpy as np

import bcolz


N = int(2e8)
dtype = 'i4'

t0 = time()
a = np.zeros(N, dtype=dtype)
print("Time numpy.zeros() --> %.4f" % (time() - t0))

t0 = time()
ac = bcolz.zeros(N, dtype=dtype)
# ac = bcolz.carray(a)
print("Time bcolz.zeros() --> %.4f" % (time() - t0))

print("ac-->", repr(ac))

#assert(np.all(a == ac))
Example 29
    def make_group_index(self, groupby_cols, bool_arr):
        '''Create unique groups for groupby loop

            Args:
                groupby_cols:
                bool_arr:

            Returns:
                carray: (carray_factor)
                int: (nr_groups) the number of resulting groups
                int: (skip_key)
        '''
        factor_list, values_list = self.factorize_groupby_cols(groupby_cols)

        # create unique groups for groupby loop
        if len(factor_list) == 0:
            # no columns to groupby over, so directly aggregate the measure
            # columns to 1 total
            tmp_rootdir = self.create_tmp_rootdir()
            carray_factor = bcolz.zeros(len(self), dtype='int64', rootdir=tmp_rootdir, mode='w')
            carray_values = ['Total']
        elif len(factor_list) == 1:
            # single column groupby, the groupby output column
            # here is 1:1 to the values
            carray_factor = factor_list[0]
            carray_values = values_list[0]
        else:
            # multi column groupby
            # first combine the factorized columns to single values
            if self.group_cache_valid(col_list=groupby_cols):
                # there is a group cache that we can use
                col_rootdir = os.path.join(self.rootdir, self.create_group_base_name(groupby_cols))
                col_factor_rootdir = col_rootdir + '.factor'
                carray_factor = bcolz.carray(rootdir=col_factor_rootdir)
                col_values_rootdir = col_rootdir + '.values'
                carray_values = bcolz.carray(rootdir=col_values_rootdir)
            else:
                # create a brand new groupby col combination
                carray_factor, carray_values = \
                    self.create_group_column_factor(factor_list, groupby_cols, cache=self.auto_cache)

        nr_groups = len(carray_values)
        skip_key = None

        if bool_arr is not None:
            # make all non relevant combinations -1
            tmp_rootdir = self.create_tmp_rootdir()
            carray_factor = bcolz.eval(
                '(factor + 1) * bool - 1',
                user_dict={'factor': carray_factor, 'bool': bool_arr}, rootdir=tmp_rootdir, mode='w')
            # now check how many unique values there are left
            tmp_rootdir = self.create_tmp_rootdir()
            labels = bcolz.carray([], dtype='int64', expectedlen=len(carray_factor), rootdir=tmp_rootdir, mode='w')
            carray_factor, values = ctable_ext.factorize(carray_factor, labels)
            # values might contain one value too much (-1) (no direct lookup
            # possible because values is a reversed dict)
            filter_check = \
                [key for key, value in values.items() if value == -1]
            if filter_check:
                skip_key = filter_check[0]
            # the new nr of groups depends on the outcome after filtering
            nr_groups = len(values)

        # using nr_groups as a total length might be off by one due to the skip_key
        # (skipping a row in aggregation)
        # but that is okay normally

        if skip_key is None:
            # if we shouldn't skip a row, set it at the first row after the total number of groups
            skip_key = nr_groups

        return carray_factor, nr_groups, skip_key
Example 30
# Benchmark to check the creation of an array of length > 2**32 (5e9)

import sys
from time import time

import bcolz

if sys.version_info >= (3,0):
    long = int


t0 = time()
#cn = bcolz.zeros(int(5e9), dtype="i1")
cn = bcolz.zeros(int(5e9), dtype="i1", rootdir='large_carray-bench', mode='w')
print("Creation time:", round(time() - t0, 3))
print("len:", len(cn))
assert len(cn) == int(5e9)

t0 = time()
cn = bcolz.carray(rootdir='large_carray-bench', mode='a')
print("Re-open time:", round(time() - t0, 3))
print("len(cn)", len(cn))
assert len(cn) == int(5e9)

# Now check some accesses
cn[1] = 1
assert cn[1] == 1
cn[int(2e9)] = 2
assert cn[int(2e9)] == 2
cn[long(3e9)] = 3
assert cn[long(3e9)] == 3
Example 31
    def create_group_column_factor(self, factor_list, groupby_cols, cache=False):
        """
        Create a unique, factorized column out of several individual columns

        Parameters
        ----------
        factor_list
        groupby_cols
        cache

        Returns
        -------

        """
        if not self.rootdir:
            # in-memory scenario
            input_rootdir = None
            col_rootdir = None
            col_factor_rootdir = None
            col_values_rootdir = None
            col_factor_rootdir_tmp = None
            col_values_rootdir_tmp = None
        else:
            # temporary
            input_rootdir = tempfile.mkdtemp(prefix='bcolz-')
            col_factor_rootdir_tmp = tempfile.mkdtemp(prefix='bcolz-')
            col_values_rootdir_tmp = tempfile.mkdtemp(prefix='bcolz-')

        # create combination of groupby columns
        group_array = bcolz.zeros(0, dtype=np.int64, expectedlen=len(self), rootdir=input_rootdir, mode='w')
        factor_table = bcolz.ctable(factor_list, names=groupby_cols)
        ctable_iter = factor_table.iter(outcols=groupby_cols, out_flavor=tuple)
        ctable_ext.create_group_index(ctable_iter, len(groupby_cols), group_array)

        # now factorize the results
        carray_factor = \
            bcolz.carray([], dtype='int64', expectedlen=self.size, rootdir=col_factor_rootdir_tmp, mode='w')
        carray_factor, values = ctable_ext.factorize(group_array, labels=carray_factor)
        carray_factor.flush()

        carray_values = \
            bcolz.carray(np.fromiter(values.values(), dtype=np.int64), rootdir=col_values_rootdir_tmp, mode='w')
        carray_values.flush()

        del group_array
        if cache:
            # clean up the temporary file
            rm_file_or_dir(input_rootdir, ignore_errors=True)

        if cache:
            # official end destination
            col_rootdir = os.path.join(self.rootdir, self.create_group_base_name(groupby_cols))
            col_factor_rootdir = col_rootdir + '.factor'
            col_values_rootdir = col_rootdir + '.values'
            lock_file = col_rootdir + '.lock'

            # only works for linux
            if not os.path.exists(lock_file):
                uid = str(uuid.uuid4())
                try:
                    with open(lock_file, 'a+') as fn:
                        fn.write(uid + '\n')
                    with open(lock_file, 'r') as fn:
                        temp = fn.read().splitlines()
                    if temp[0] == uid:
                        lock = True
                    else:
                        lock = False
                    del temp
                except Exception:
                    lock = False
            else:
                lock = False

            if lock:
                rm_file_or_dir(col_factor_rootdir, ignore_errors=False)
                shutil.move(col_factor_rootdir_tmp, col_factor_rootdir)
                carray_factor = bcolz.carray(rootdir=col_factor_rootdir, mode='r')

                rm_file_or_dir(col_values_rootdir, ignore_errors=False)
                shutil.move(col_values_rootdir_tmp, col_values_rootdir)
                carray_values = bcolz.carray(rootdir=col_values_rootdir, mode='r')
            else:
                # another process has a lock, we will work with our current files and clean up later
                self._dir_clean_list.append(col_factor_rootdir)
                self._dir_clean_list.append(col_values_rootdir)

        return carray_factor, carray_values
Example 32
def _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor, blen,
                 **kwargs):
    """Perform the evaluation in blocks."""

    if not blen:
        # Compute the optimal block size (in elements)
        # The next is based on experiments with bench/ctable-query.py
        # and the 'movielens-bench' repository
        if vm == "numexpr":
            bsize = 2**23
        elif vm == "dask":
            bsize = 2**25
        else:  # python
            bsize = 2**21
        blen = int(bsize / typesize)
        # Protection against too large atomsizes
        if blen == 0:
            blen = 1

    if vm == "dask":
        if 'da' in vars:
            raise NameError("'da' is reserved as a prefix for dask.array. "
                            "Please use another prefix")
        for name in vars:
            var = vars[name]
            if is_sequence_like(var):
                vars[name] = da.from_array(var,
                                           chunks=(blen, ) + var.shape[1:])
        # Build the expression graph
        vars['da'] = da
        da_expr = _eval(expression, vars)
        if out_flavor in ("bcolz", "carray") and da_expr.shape:
            result = bcolz.zeros(da_expr.shape, da_expr.dtype, **kwargs)
            # Store while compute expression graph
            da.store(da_expr, result)
            return result
        else:
            # Store while compute
            return np.array(da_expr)

    # Check whether we have a re_evaluate() function in numexpr
    re_evaluate = bcolz.numexpr_here and hasattr(bcolz.numexpr, "re_evaluate")

    vars_ = {}
    # Get containers for vars
    maxndims = 0
    for name in vars:
        var = vars[name]
        if is_sequence_like(var):
            ndims = len(var.shape) + len(var.dtype.shape)
            if ndims > maxndims:
                maxndims = ndims
            if len(var) > blen and hasattr(var, "_getrange"):
                shape = (blen, ) + var.shape[1:]
                vars_[name] = np.empty(shape, dtype=var.dtype)

    for i in range(0, vlen, blen):
        # Fill buffers for vars
        for name in vars:
            var = vars[name]
            if is_sequence_like(var) and len(var) > blen:
                if hasattr(var, "_getrange"):
                    if i + blen < vlen:
                        var._getrange(i, blen, vars_[name])
                    else:
                        vars_[name] = var[i:]
                else:
                    vars_[name] = var[i:i + blen]
            else:
                if hasattr(var, "__getitem__"):
                    vars_[name] = var[:]
                else:
                    vars_[name] = var

        # Perform the evaluation for this block
        if vm == "python":
            res_block = _eval(expression, vars_)
        else:
            if i == 0 or not re_evaluate:
                try:
                    res_block = bcolz.numexpr.evaluate(expression,
                                                       local_dict=vars_)
                except ValueError:
                    # numexpr cannot handle this, so fall back to "python" vm
                    warnings.warn(
                        "numexpr cannot handle this expression: falling back "
                        "to the 'python' virtual machine.  You can choose "
                        "another virtual machine by using the `vm` parameter.")
                    return _eval_blocks(expression, vars, vlen, typesize,
                                        "python", out_flavor, blen, **kwargs)
            else:
                res_block = bcolz.numexpr.re_evaluate(local_dict=vars_)

        if i == 0:
            # Detection of reduction operations
            scalar = False
            dim_reduction = False
            if len(res_block.shape) == 0:
                scalar = True
                result = res_block
                continue
            elif len(res_block.shape) < maxndims:
                dim_reduction = True
                result = res_block
                continue
            # Get a decent default for expectedlen
            if out_flavor in ("bcolz", "carray"):
                nrows = kwargs.pop('expectedlen', vlen)
                result = bcolz.carray(res_block, expectedlen=nrows, **kwargs)
            else:
                out_shape = list(res_block.shape)
                out_shape[0] = vlen
                result = np.empty(out_shape, dtype=res_block.dtype)
                result[:blen] = res_block
        else:
            if scalar or dim_reduction:
                result += res_block
            elif out_flavor in ("bcolz", "carray"):
                result.append(res_block)
            else:
                result[i:i + blen] = res_block

    if isinstance(result, bcolz.carray):
        result.flush()
    if scalar:
        return result[()]
    return result
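_eval_blocks is internal machinery; the public entry point is bcolz.eval. A minimal sketch, assuming a working numexpr or python virtual machine:

import numpy as np
import bcolz

a = bcolz.carray(np.arange(1000))
b = bcolz.carray(np.linspace(0.0, 1.0, 1000))
res = bcolz.eval('a * 2 + b', user_dict={'a': a, 'b': b})
print(res)  # a carray, evaluated block by block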