def get_range(self, start="", finish="", columns=None, column_start="",
              column_finish="", column_reversed=False, column_count=100,
              row_count=None, include_timestamp=False, super_column=None,
              read_consistency_level=None, buffer_size=None,
              filter_empty=True, include_ttl=False, start_token=None,
              finish_token=None):
    """
    Get an iterator over rows in a specified key range.

    The key range begins with `start` and ends with `finish`. If left
    as empty strings, these extend to the beginning and end,
    respectively.  Note that if RandomPartitioner is used, rows are
    stored in the order of the MD5 hash of their keys, so getting a
    lexicographical range of keys is not feasible.

    In place of `start` and `finish`, you may use `start_token` and
    `finish_token` or a combination of `start` and `finish_token`.  In
    this case, you are specifying a token range to fetch instead of a
    key range.  This can be useful for fetching all data owned by a
    node or for parallelizing a full data set scan.  Otherwise, you
    should typically just use `start` and `finish`.  Both
    `start_token` and `finish_token` must be specified as hex-encoded
    strings.

    The `row_count` parameter limits the total number of rows that may
    be returned.  If left as ``None``, the number of rows that may be
    returned is unlimited (this is the default).

    When calling `get_range()`, the intermediate results need to be
    buffered if we are fetching many rows, otherwise the Cassandra
    server will overallocate memory and fail.  `buffer_size` is the
    size of that buffer in number of rows.  If left as ``None``, the
    ColumnFamily's :attr:`buffer_size` attribute will be used.

    When `filter_empty` is left as ``True``, empty rows (including
    `range ghosts <http://wiki.apache.org/cassandra/FAQ#range_ghosts>`_)
    will be skipped and will not count towards `row_count`.

    All other parameters are the same as those of :meth:`get()`.

    A generator over ``(key, {column_name: column_value})`` is
    returned.  To convert this to a list, use ``list()`` on the result.
    """
    cl = read_consistency_level or self.read_consistency_level
    cp = self._column_parent(super_column)
    sp = self._slice_predicate(columns, column_start, column_finish,
                               column_reversed, column_count, super_column)

    # Keyword arguments for the Thrift KeyRange struct.  Endpoints may be
    # key-based (start_key/end_key), token-based (start_token/end_token),
    # or the mixed start_key/end_token form -- but a key and a token may
    # never be given for the same endpoint, which the checks below enforce.
    kr_args = {}
    count = 0   # rows yielded so far (empty rows excluded when filter_empty)
    i = 0       # number of completed fetch round-trips
    if start_token is not None and (start not in ("", None) or finish not in ("", None)):
        raise ValueError(
            "ColumnFamily.get_range() received incompatible arguments: "
            "'start_token' may not be used with 'start' or 'finish'")
    if finish_token is not None and finish not in ("", None):
        raise ValueError(
            "ColumnFamily.get_range() received incompatible arguments: "
            "'finish_token' may not be used with 'finish'")

    if start_token is not None:
        kr_args['start_token'] = start_token
        kr_args['end_token'] = "" if finish_token is None else finish_token
    elif finish_token is not None:
        kr_args['start_key'] = self._pack_key(start)
        kr_args['end_token'] = finish_token
    else:
        kr_args['start_key'] = self._pack_key(start)
        kr_args['end_key'] = self._pack_key(finish)

    if buffer_size is None:
        buffer_size = self.buffer_size
    while True:
        if row_count is not None:
            if i == 0 and row_count <= buffer_size:
                # We don't need to chunk, grab exactly the number of rows
                buffer_size = row_count
            else:
                # +1 covers the duplicated first row of each chunk after
                # the first (see the j == 0 skip below).
                buffer_size = min(row_count - count + 1, buffer_size)
        kr_args['count'] = buffer_size
        key_range = KeyRange(**kr_args)
        key_slices = self.pool.execute('get_range_slices', cp, sp, key_range, cl)
        # This may happen if nothing was ever inserted
        if key_slices is None:
            return
        for j, key_slice in enumerate(key_slices):
            # Ignore the first element after the first iteration
            # because it will be a duplicate.
            if j == 0 and i != 0:
                continue
            if filter_empty and not key_slice.columns:
                continue
            yield (self._unpack_key(key_slice.key),
                   self._cosc_to_dict(key_slice.columns, include_timestamp, include_ttl))
            count += 1
            if row_count is not None and count >= row_count:
                return

        # A chunk shorter than requested means the range is exhausted.
        if len(key_slices) != buffer_size:
            return
        # Resume from the last key seen.  Paging always continues with a
        # concrete key, so any start_token is dropped after the first fetch.
        if 'start_token' in kr_args:
            del kr_args['start_token']
        kr_args['start_key'] = key_slices[-1].key
        i += 1
def get_range(self, start="", finish="", columns=None, column_start="",
              column_finish="", column_reversed=False, column_count=100,
              row_count=None, include_timestamp=False, super_column=None,
              read_consistency_level=None, buffer_size=None):
    """
    Get an iterator over rows in a specified key range.

    The key range begins with `start` and ends with `finish`. If left
    as empty strings, these extend to the beginning and end,
    respectively.  Note that if RandomPartitioner is used, rows are
    stored in the order of the MD5 hash of their keys, so getting a
    lexicographical range of keys is not feasible.

    The `row_count` parameter limits the total number of rows that may
    be returned.  If left as ``None``, the number of rows that may be
    returned is unlimited (this is the default).

    When calling `get_range()`, the intermediate results need to be
    buffered if we are fetching many rows, otherwise the Cassandra
    server will overallocate memory and fail.  `buffer_size` is the
    size of that buffer in number of rows.  If left as ``None``, the
    ColumnFamily's `buffer_size` attribute will be used.

    All other parameters are the same as those of :meth:`get()`.

    A generator over ``(key, {column_name: column_value})`` is
    returned.  To convert this to a list, use ``list()`` on the result.
    """
    cp = self._create_column_parent(super_column)
    sp = self._create_slice_predicate(columns, column_start, column_finish,
                                      column_reversed, column_count)
    count = 0     # rows yielded so far
    i = 0         # number of completed fetch round-trips
    last_key = start

    if buffer_size is None:
        buffer_size = self.buffer_size
    while True:
        if row_count is not None:
            if i == 0 and row_count <= buffer_size:
                # FIX: on the first fetch there is no duplicated boundary
                # row, so ask for exactly row_count rows instead of
                # row_count + 1 (the old code always applied the +1 and
                # over-fetched one row it then discarded).
                buffer_size = row_count
            else:
                # +1 covers the duplicated first row of each chunk after
                # the first (see the j == 0 skip below).
                buffer_size = min(row_count - count + 1, buffer_size)
        key_range = KeyRange(start_key=last_key, end_key=finish, count=buffer_size)
        try:
            self._obtain_connection()
            key_slices = self._tlocal.client.get_range_slices(
                cp, sp, key_range, self._rcl(read_consistency_level))
        finally:
            # Always return the connection to the pool, even on error.
            self._release_connection()

        # This may happen if nothing was ever inserted
        if key_slices is None:
            return
        for j, key_slice in enumerate(key_slices):
            # Ignore the first element after the first iteration
            # because it will be a duplicate.
            if j == 0 and i != 0:
                continue
            yield (key_slice.key,
                   self._convert_ColumnOrSuperColumns_to_dict_class(
                       key_slice.columns, include_timestamp))
            count += 1
            if row_count is not None and count >= row_count:
                return

        # A chunk shorter than requested means the range is exhausted.
        if len(key_slices) != buffer_size:
            return
        last_key = key_slices[-1].key
        i += 1
def get_range(self, start="", finish="", columns=None, column_start="",
              column_finish="", column_reversed=False, column_count=100,
              row_count=None, include_timestamp=False, super_column=None,
              read_consistency_level=None):
    """
    Get an iterator over keys in a specified range

    :Parameters:
        `start`: str
            Start from this key (inclusive)
        `finish`: str
            End at this key (inclusive)
        `columns`: [str]
            Limit the columns or super_columns fetched to the specified list
        `column_start`: str
            Only fetch when a column or super_column is >= column_start
        `column_finish`: str
            Only fetch when a column or super_column is <= column_finish
        `column_reversed`: bool
            Fetch the columns or super_columns in reverse order. This will do
            nothing unless you passed a dict_class to the constructor.
        `column_count`: int
            Limit the number of columns or super_columns fetched per key
        `row_count`: int
            Limit the number of rows fetched
        `include_timestamp`: bool
            If true, return a (value, timestamp) tuple for each column
        `super_column`: string
            Return columns only in this super_column
        `read_consistency_level`: :class:`pycassa.cassandra.ttypes.ConsistencyLevel`
            Affects the guaranteed replication factor before returning from
            any read operation

    :Returns:
        iterator over ('key', {'column': 'value'})
    """
    (super_column, column_start, column_finish) = \
        self._pack_slice_cols(super_column, column_start, column_finish)

    packed_cols = None
    if columns is not None:
        packed_cols = []
        for col in columns:
            packed_cols.append(
                self._pack_name(col, is_supercol_name=self.super))

    cp = ColumnParent(column_family=self.column_family,
                      super_column=super_column)
    sp = create_SlicePredicate(packed_cols, column_start, column_finish,
                               column_reversed, column_count)

    count = 0     # rows yielded so far
    i = 0         # number of completed fetch round-trips
    last_key = start
    buffer_size = self.buffer_size
    while True:
        if row_count is not None:
            if i == 0 and row_count <= buffer_size:
                # The whole limit fits in one chunk; fetch exactly that many.
                buffer_size = row_count
            else:
                # FIX: shrink the chunk as the limit approaches instead of
                # always fetching a full buffer; +1 covers the duplicated
                # first row of each chunk after the first.
                buffer_size = min(row_count - count + 1, buffer_size)
        key_range = KeyRange(start_key=last_key, end_key=finish,
                             count=buffer_size)
        key_slices = self.client.get_range_slices(
            cp, sp, key_range, self._rcl(read_consistency_level))
        # This may happen if nothing was ever inserted
        if key_slices is None:
            return
        for j, key_slice in enumerate(key_slices):
            # Ignore the first element after the first iteration
            # because it will be a duplicate.
            if j == 0 and i != 0:
                continue
            yield (key_slice.key,
                   self._convert_ColumnOrSuperColumns_to_dict_class(
                       key_slice.columns, include_timestamp))
            count += 1
            if row_count is not None and count >= row_count:
                return

        # FIX: compare against the chunk size actually requested for this
        # round-trip.  The old code compared len(key_slices) against
        # self.buffer_size even when a smaller chunk had been requested,
        # so a full (smaller) chunk could never match the check.
        if len(key_slices) != buffer_size:
            return
        last_key = key_slices[-1].key
        i += 1
def get_range(self, start="", finish="", columns=None, column_start="",
              column_finish="", column_reversed=False, column_count=100,
              row_count=None, include_timestamp=False, super_column=None,
              read_consistency_level=None, buffer_size=None,
              filter_empty=True, include_ttl=False):
    """
    Iterate over the rows whose keys fall between `start` and `finish`.

    Empty strings for `start` and `finish` mean "from the beginning"
    and "to the end" respectively.  With RandomPartitioner, rows are
    ordered by the MD5 hash of their keys, so a lexicographical key
    range is not feasible.

    `row_count` caps the total number of rows produced; ``None`` (the
    default) means no limit.  Results are fetched from the server in
    chunks of `buffer_size` rows (defaulting to the ColumnFamily's
    :attr:`buffer_size`) so that large scans do not cause the Cassandra
    server to overallocate memory.

    While `filter_empty` is ``True``, empty rows -- including `range
    ghosts <http://wiki.apache.org/cassandra/FAQ#range_ghosts>`_ --
    are skipped and do not count towards `row_count`.

    All other parameters behave as in :meth:`get()`.

    Returns a generator of ``(key, {column_name: column_value})``
    pairs; use ``list()`` on the result to materialize it.
    """
    consistency = read_consistency_level or self.read_consistency_level
    parent = self._column_parent(super_column)
    predicate = self._slice_predicate(columns, column_start, column_finish,
                                      column_reversed, column_count,
                                      super_column)

    if buffer_size is None:
        buffer_size = self.buffer_size

    rows_yielded = 0
    fetches = 0
    next_start = self._pack_key(start)
    end_key = self._pack_key(finish)

    while True:
        if row_count is not None:
            if fetches == 0 and row_count <= buffer_size:
                # The whole limit fits in a single chunk.
                buffer_size = row_count
            else:
                # One extra row covers the duplicated chunk boundary.
                buffer_size = min(row_count - rows_yielded + 1, buffer_size)

        key_range = KeyRange(start_key=next_start, end_key=end_key,
                             count=buffer_size)
        key_slices = self.pool.execute('get_range_slices', parent,
                                       predicate, key_range, consistency)
        if key_slices is None:
            # Nothing was ever inserted into this column family.
            return

        slice_iter = iter(key_slices)
        if fetches != 0:
            # The first row of every chunk after the first repeats the
            # last row of the previous chunk; drop it.
            next(slice_iter, None)
        for key_slice in slice_iter:
            if filter_empty and not key_slice.columns:
                continue
            yield (self._unpack_key(key_slice.key),
                   self._cosc_to_dict(key_slice.columns,
                                      include_timestamp, include_ttl))
            rows_yielded += 1
            if row_count is not None and rows_yielded >= row_count:
                return

        if len(key_slices) != buffer_size:
            # A short chunk means the key range is exhausted.
            return
        next_start = key_slices[-1].key
        fetches += 1