def as_array(self, fields=None):
    """Read the rows produced by ``self.read`` into a structured numpy array.

    Parameters
    ----------
    fields : optional
        Field description accepted by ``np.dtype``; it is also what gets
        passed to ``self.read``. Falls back to ``self.fields`` when None.

    Returns
    -------
    numpy array whose dtype is ``np.dtype(fields)``, built by asking
    ``fromiter`` for ``self.numlines`` rows.
    """
    selected = self.fields if fields is None else fields
    # The csv file is assumed to already be in the correct order
    # (i.e. by period, then by id), so rows are taken as they come.
    rows = self.read(selected)
    return fromiter(rows, dtype=np.dtype(selected), count=self.numlines)
def stream_to_table(h5file, node, name, fields, datastream, numlines=None,
                    title=None, invert=(), buffersize=10 * 2**20,
                    compression=None):
    """Create a table in an HDF5 file and fill it from a stream of rows.

    The stream is loaded buffer by buffer: each buffer is converted to a
    structured numpy array with ``fromiter`` and appended to the table.

    Parameters
    ----------
    h5file : object providing ``create_table(node, name, dtype, title=...,
        filters=...)`` (presumably an open PyTables file -- confirm against
        callers).
    node, name : passed unchanged to ``h5file.create_table``.
    fields : field description accepted by ``np.dtype``; defines the row
        layout of the table.
    datastream : iterable of rows compatible with ``np.dtype(fields)``.
    numlines : int, optional
        Total number of rows to load. When given, ``fromiter`` is asked for
        an exact row count per buffer; when None, the stream is read until
        it is exhausted.
    title : passed unchanged to ``h5file.create_table``.
    invert : iterable of field names whose values are replaced by their
        bitwise inverse (``~``) before being appended.
    buffersize : int
        Target size in bytes of one in-memory buffer (default 10 * 2**20).
        At least one row is always buffered, even if a single row is
        larger than this.
    compression : value handed to ``compression_str2filter``, which returns
        the message to print and the filters given to ``create_table``.

    Returns
    -------
    The table returned by ``h5file.create_table``, flushed after loading.
    """
    # make sure datastream is an iterator, not a list, otherwise it could
    # loop indefinitely as it will never be consumed.
    # Note that, contrary to what I thought, we shouldn't make a special case
    # for that as np.fromiter(islice(iter(l), max_rows)) is faster than
    # np.array(l[:max_rows])
    datastream = iter(datastream)

    msg, filters = compression_str2filter(compression)
    print(" - storing %s..." % msg)
    dtype = np.dtype(fields)
    table = h5file.create_table(node, name, dtype, title=title,
                                filters=filters)

    # buffered load
    # Never let the buffer shrink to zero rows: when a single row is larger
    # than buffersize, a zero-row buffer would end the loop on its first
    # pass and silently leave the table empty.
    max_buffer_rows = max(buffersize // dtype.itemsize, 1)
    while True:
        dataslice = islice(datastream, max_buffer_rows)
        if numlines is not None:
            if numlines <= 0:
                break
            buffer_rows = min(numlines, max_buffer_rows)
            # ideally, we should preallocate an empty buffer and reuse it,
            # but that does not seem to be supported by numpy
            array = fromiter(dataslice, dtype=dtype, count=buffer_rows)
            numlines -= buffer_rows
        else:
            array = fromiter(dataslice, dtype=dtype)
            if not len(array):
                break

        for field in invert:
            array[field] = ~array[field]
        table.append(array)
        table.flush()

    return table
def stream_to_table(
    h5file,
    node,
    name,
    fields,
    datastream,
    numlines=None,
    title=None,
    invert=(),
    buffersize=10 * 2 ** 20,
    compression=None,
):
    """Create a table in an HDF5 file and fill it from a stream of rows.

    The stream is loaded buffer by buffer: each buffer is converted to a
    structured numpy array with ``fromiter`` and appended to the table.

    Parameters
    ----------
    h5file : object providing ``create_table(node, name, dtype, title=...,
        filters=...)`` (presumably an open PyTables file -- confirm against
        callers).
    node, name : passed unchanged to ``h5file.create_table``.
    fields : field description accepted by ``np.dtype``; defines the row
        layout of the table.
    datastream : iterable of rows compatible with ``np.dtype(fields)``.
    numlines : int, optional
        Total number of rows to load. When given, ``fromiter`` is asked for
        an exact row count per buffer; when None, the stream is read until
        it is exhausted.
    title : passed unchanged to ``h5file.create_table``.
    invert : iterable of field names whose values are replaced by their
        bitwise inverse (``~``) before being appended.
    buffersize : int
        Target size in bytes of one in-memory buffer (default 10 * 2 ** 20).
        At least one row is always buffered, even if a single row is
        larger than this.
    compression : value handed to ``compression_str2filter``, which returns
        the message to print and the filters given to ``create_table``.

    Returns
    -------
    The table returned by ``h5file.create_table``, flushed after loading.
    """
    # make sure datastream is an iterator, not a list, otherwise it could
    # loop indefinitely as it will never be consumed.
    # Note that, contrary to what I thought, we shouldn't make a special case
    # for that as np.fromiter(islice(iter(l), max_rows)) is faster than
    # np.array(l[:max_rows])
    datastream = iter(datastream)

    msg, filters = compression_str2filter(compression)
    # Parenthesised form works as a statement on Python 2 and as a function
    # call on Python 3, and prints the same text on both.
    print(" - storing %s..." % msg)
    dtype = np.dtype(fields)
    # create_table is the PEP 8 name that replaced the camelCase createTable.
    table = h5file.create_table(node, name, dtype, title=title,
                                filters=filters)

    # buffered load
    # Floor division keeps the row count an int on Python 3 (islice rejects
    # floats). The clamp to 1 prevents a zero-row buffer, which would end
    # the loop on its first pass and silently leave the table empty when a
    # single row is larger than buffersize.
    max_buffer_rows = max(buffersize // dtype.itemsize, 1)
    while True:
        dataslice = islice(datastream, max_buffer_rows)
        if numlines is not None:
            if numlines <= 0:
                break
            buffer_rows = min(numlines, max_buffer_rows)
            # ideally, we should preallocate an empty buffer and reuse it,
            # but that does not seem to be supported by numpy
            array = fromiter(dataslice, dtype=dtype, count=buffer_rows)
            numlines -= buffer_rows
        else:
            array = fromiter(dataslice, dtype=dtype)
            if not len(array):
                break

        for field in invert:
            array[field] = ~array[field]
        table.append(array)
        table.flush()

    return table
def stream_to_array(fields, datastream, numlines=None, invert=()):
    """Load a whole stream of rows into one structured numpy array.

    Parameters
    ----------
    fields : field description accepted by ``np.dtype``; defines the row
        layout of the result.
    datastream : iterable of rows compatible with ``np.dtype(fields)``.
    numlines : int, optional
        Number of rows to request from ``fromiter``. When None, a count of
        -1 is passed, i.e. the stream is read until it is exhausted.
    invert : iterable of field names whose values are replaced by their
        bitwise inverse before the array is returned.

    Returns
    -------
    numpy array of dtype ``np.dtype(fields)``.
    """
    # Work on an iterator whatever the input was (list, generator, ...).
    row_iter = iter(datastream)
    row_type = np.dtype(fields)

    if numlines is None:
        # -1 tells fromiter to consume everything
        expected = -1
    else:
        expected = numlines

    rows = fromiter(row_iter, dtype=row_type, count=expected)
    for column in invert:
        rows[column] = np.invert(rows[column])
    return rows