Example #1
0
    def get_range(self, start="", finish="", columns=None, column_start="",
                  column_finish="", column_reversed=False, column_count=100,
                  row_count=None, include_timestamp=False,
                  super_column=None, read_consistency_level=None,
                  buffer_size=None, filter_empty=True, include_ttl=False,
                  start_token=None, finish_token=None):
        """
        Get an iterator over rows in a specified key range.

        The key range begins with `start` and ends with `finish`. If left
        as empty strings, these extend to the beginning and end, respectively.
        Note that if RandomPartitioner is used, rows are stored in the
        order of the MD5 hash of their keys, so getting a lexicographical range
        of keys is not feasible.

        In place of `start` and `finish`, you may use `start_token` and
        `finish_token` or a combination of `start` and `finish_token`.  In this
        case, you are specifying a token range to fetch instead of a key
        range.  This can be useful for fetching all data owned
        by a node or for parallelizing a full data set scan. Otherwise,
        you should typically just use `start` and `finish`.  Both `start_token`
        and `finish_token` must be specified as hex-encoded strings.

        The `row_count` parameter limits the total number of rows that may be
        returned. If left as ``None``, the number of rows that may be returned
        is unlimited (this is the default).

        When calling `get_range()`, the intermediate results need to be
        buffered if we are fetching many rows, otherwise the Cassandra
        server will overallocate memory and fail. `buffer_size` is the
        size of that buffer in number of rows. If left as ``None``, the
        ColumnFamily's :attr:`buffer_size` attribute will be used.

        When `filter_empty` is left as ``True``, empty rows (including
        `range ghosts <http://wiki.apache.org/cassandra/FAQ#range_ghosts>`_)
        will be skipped and will not count towards `row_count`.

        All other parameters are the same as those of :meth:`get()`.

        A generator over ``(key, {column_name: column_value})`` is returned.
        To convert this to a list, use ``list()`` on the result.

        :raises ValueError: if `start_token` is combined with `start` or
            `finish`, or `finish_token` is combined with `finish`.
        """

        cl = read_consistency_level or self.read_consistency_level
        cp = self._column_parent(super_column)
        sp = self._slice_predicate(columns, column_start, column_finish,
                                   column_reversed, column_count, super_column)

        kr_args = {}  # keyword arguments for KeyRange; mutated as we page
        count = 0     # rows yielded so far (empty rows may be excluded below)
        i = 0         # page number; pages after the first start on a duplicate row

        # A token bound and a key bound for the same end of the range are
        # mutually exclusive; reject the ambiguous combinations up front.
        if start_token is not None and (start not in ("", None) or finish not in ("", None)):
            raise ValueError(
                "ColumnFamily.get_range() received incompatible arguments: "
                "'start_token' may not be used with 'start' or 'finish'")

        if finish_token is not None and finish not in ("", None):
            raise ValueError(
                "ColumnFamily.get_range() received incompatible arguments: "
                "'finish_token' may not be used with 'finish'")

        # Build the initial KeyRange bounds: token/token, key/token, or key/key.
        if start_token is not None:
            kr_args['start_token'] = start_token
            kr_args['end_token'] = "" if finish_token is None else finish_token
        elif finish_token is not None:
            kr_args['start_key'] = self._pack_key(start)
            kr_args['end_token'] = finish_token
        else:
            kr_args['start_key'] = self._pack_key(start)
            kr_args['end_key'] = self._pack_key(finish)

        if buffer_size is None:
            buffer_size = self.buffer_size
        while True:
            if row_count is not None:
                if i == 0 and row_count <= buffer_size:
                    # We don't need to chunk, grab exactly the number of rows
                    buffer_size = row_count
                else:
                    # +1 covers the duplicated boundary row on later pages.
                    buffer_size = min(row_count - count + 1, buffer_size)
            kr_args['count'] = buffer_size
            key_range = KeyRange(**kr_args)
            key_slices = self.pool.execute('get_range_slices', cp, sp, key_range, cl)
            # This may happen if nothing was ever inserted
            if key_slices is None:
                return
            for j, key_slice in enumerate(key_slices):
                # Ignore the first element after the first iteration
                # because it will be a duplicate.
                if j == 0 and i != 0:
                    continue
                if filter_empty and not key_slice.columns:
                    continue
                yield (self._unpack_key(key_slice.key),
                       self._cosc_to_dict(key_slice.columns, include_timestamp, include_ttl))
                count += 1
                if row_count is not None and count >= row_count:
                    return

            # A short page means the range is exhausted.
            if len(key_slices) != buffer_size:
                return
            # From the second page on we always continue by key, starting at
            # the last key we saw (even if the scan began with a start_token).
            if 'start_token' in kr_args:
                del kr_args['start_token']
            kr_args['start_key'] = key_slices[-1].key
            i += 1
Example #2
0
    def get_range(self,
                  start="",
                  finish="",
                  columns=None,
                  column_start="",
                  column_finish="",
                  column_reversed=False,
                  column_count=100,
                  row_count=None,
                  include_timestamp=False,
                  super_column=None,
                  read_consistency_level=None,
                  buffer_size=None):
        """
        Return a generator over the rows in the key range [`start`, `finish`].

        Empty strings for `start`/`finish` mean "from the beginning" and
        "to the end" respectively.  Keep in mind that under RandomPartitioner
        rows are ordered by the MD5 hash of their keys, so a lexicographical
        key range cannot be fetched.

        `row_count` caps how many rows are produced; ``None`` (the default)
        means no limit.  Rows are fetched from the server in pages of
        `buffer_size` rows to avoid overallocating server memory; when
        `buffer_size` is ``None``, the ColumnFamily's `buffer_size`
        attribute is used.

        All other parameters behave as in :meth:`get()`.

        Yields ``(key, {column_name: column_value})`` tuples; wrap the
        result in ``list()`` to materialize it.
        """

        cp = self._create_column_parent(super_column)
        sp = self._create_slice_predicate(columns, column_start, column_finish,
                                          column_reversed, column_count)

        if buffer_size is None:
            buffer_size = self.buffer_size

        yielded = 0          # rows produced so far
        first_page = True    # pages after the first begin with a duplicate row
        next_start = start   # start key for the next server fetch

        while True:
            if row_count is not None:
                # Never ask for more than the remaining quota; the +1 pays
                # for the duplicated boundary row on subsequent pages.
                buffer_size = min(row_count - yielded + 1, buffer_size)
            page_range = KeyRange(start_key=next_start,
                                  end_key=finish,
                                  count=buffer_size)
            try:
                self._obtain_connection()
                key_slices = self._tlocal.client.get_range_slices(
                    cp, sp, page_range, self._rcl(read_consistency_level))
            finally:
                self._release_connection()

            # The server returns nothing at all if no data was ever inserted.
            if key_slices is None:
                return

            # Drop the leading duplicate on every page but the first.
            rows = key_slices if first_page else key_slices[1:]
            for key_slice in rows:
                yield (key_slice.key,
                       self._convert_ColumnOrSuperColumns_to_dict_class(
                           key_slice.columns, include_timestamp))
                yielded += 1
                if row_count is not None and yielded >= row_count:
                    return

            # A short page means the key range is exhausted.
            if len(key_slices) != buffer_size:
                return
            next_start = key_slices[-1].key
            first_page = False
Example #3
0
    def get_range(self,
                  start="",
                  finish="",
                  columns=None,
                  column_start="",
                  column_finish="",
                  column_reversed=False,
                  column_count=100,
                  row_count=None,
                  include_timestamp=False,
                  super_column=None,
                  read_consistency_level=None):
        """
        Get an iterator over keys in a specified range

        :Parameters:
            `start`: str
                Start from this key (inclusive)
            `finish`: str
                End at this key (inclusive)
            `columns`: [str]
                Limit the columns or super_columns fetched to the specified list
            `column_start`: str
                Only fetch when a column or super_column is >= column_start
            `column_finish`: str
                Only fetch when a column or super_column is <= column_finish
            `column_reversed`: bool
                Fetch the columns or super_columns in reverse order. This will do
                nothing unless you passed a dict_class to the constructor.
            `column_count`: int
                Limit the number of columns or super_columns fetched per key
            `row_count`: int
                Limit the number of rows fetched
            `include_timestamp`: bool
                If true, return a (value, timestamp) tuple for each column
            `super_column`: string
                Return columns only in this super_column
            `read_consistency_level`: :class:`pycassa.cassandra.ttypes.ConsistencyLevel`
                Affects the guaranteed replication factor before returning from
                any read operation

        :Returns:
            iterator over ('key', {'column': 'value'})
        """

        # Pack the slice bounds (and super_column) into their on-wire forms.
        (super_column, column_start,
         column_finish) = self._pack_slice_cols(super_column, column_start,
                                                column_finish)

        # Pack any explicitly requested column names the same way.
        packed_cols = None
        if columns is not None:
            packed_cols = []
            for col in columns:
                packed_cols.append(
                    self._pack_name(col, is_supercol_name=self.super))

        cp = ColumnParent(column_family=self.column_family,
                          super_column=super_column)
        sp = create_SlicePredicate(packed_cols, column_start, column_finish,
                                   column_reversed, column_count)

        count = 0        # rows yielded so far
        i = 0            # page number; later pages start on a duplicate row
        last_key = start

        # Page size for each server fetch; never request more than row_count.
        buffer_size = self.buffer_size
        if row_count is not None:
            buffer_size = min(row_count, self.buffer_size)
        while True:
            key_range = KeyRange(start_key=last_key,
                                 end_key=finish,
                                 count=buffer_size)
            key_slices = self.client.get_range_slices(
                cp, sp, key_range, self._rcl(read_consistency_level))
            # This may happen if nothing was ever inserted
            if key_slices is None:
                return
            for j, key_slice in enumerate(key_slices):
                # Ignore the first element after the first iteration
                # because it will be a duplicate.
                if j == 0 and i != 0:
                    continue
                yield (key_slice.key,
                       self._convert_ColumnOrSuperColumns_to_dict_class(
                           key_slice.columns, include_timestamp))
                count += 1
                if row_count is not None and count >= row_count:
                    return

            # BUGFIX: compare against the page size actually requested
            # (buffer_size), not self.buffer_size.  When row_count shrinks the
            # page size, a full page of buffer_size rows must still be treated
            # as "possibly more data"; only a short page ends the scan.
            if len(key_slices) != buffer_size:
                return
            last_key = key_slices[-1].key
            i += 1
Example #4
0
    def get_range(self,
                  start="",
                  finish="",
                  columns=None,
                  column_start="",
                  column_finish="",
                  column_reversed=False,
                  column_count=100,
                  row_count=None,
                  include_timestamp=False,
                  super_column=None,
                  read_consistency_level=None,
                  buffer_size=None,
                  filter_empty=True,
                  include_ttl=False):
        """
        Return a generator over the rows in the key range [`start`, `finish`].

        Empty strings for `start`/`finish` mean "from the beginning" and
        "to the end" respectively.  Under RandomPartitioner rows are ordered
        by the MD5 hash of their keys, so a lexicographical key range cannot
        be fetched.

        `row_count` caps how many rows are produced; ``None`` (the default)
        means no limit.  Rows are pulled from the server in pages of
        `buffer_size` rows so the server does not overallocate memory; when
        `buffer_size` is ``None``, the ColumnFamily's :attr:`buffer_size`
        attribute is used.

        With `filter_empty` left as ``True``, empty rows (including
        `range ghosts <http://wiki.apache.org/cassandra/FAQ#range_ghosts>`_)
        are skipped and do not count towards `row_count`.

        All other parameters behave as in :meth:`get()`.

        Yields ``(key, {column_name: column_value})`` tuples; wrap the
        result in ``list()`` to materialize it.
        """

        cl = read_consistency_level or self.read_consistency_level
        cp = self._column_parent(super_column)
        sp = self._slice_predicate(columns, column_start, column_finish,
                                   column_reversed, column_count, super_column)

        if buffer_size is None:
            buffer_size = self.buffer_size

        yielded = 0                           # non-empty rows produced so far
        first_page = True                     # later pages start on a duplicate
        next_start = self._pack_key(start)    # packed start key for next fetch
        end_key = self._pack_key(finish)      # packed end key (fixed)

        while True:
            if row_count is not None:
                if first_page and row_count <= buffer_size:
                    # Everything fits in a single page; request exactly that.
                    buffer_size = row_count
                else:
                    # Otherwise cap at the remaining quota, +1 to pay for the
                    # duplicated boundary row on subsequent pages.
                    buffer_size = min(row_count - yielded + 1, buffer_size)

            key_range = KeyRange(start_key=next_start,
                                 end_key=end_key,
                                 count=buffer_size)
            key_slices = self.pool.execute('get_range_slices', cp, sp,
                                           key_range, cl)
            # The server returns nothing at all if no data was ever inserted.
            if key_slices is None:
                return

            # Drop the leading duplicate on every page but the first.
            rows = key_slices if first_page else key_slices[1:]
            for key_slice in rows:
                if filter_empty and not key_slice.columns:
                    continue
                yield (self._unpack_key(key_slice.key),
                       self._cosc_to_dict(key_slice.columns,
                                          include_timestamp, include_ttl))
                yielded += 1
                if row_count is not None and yielded >= row_count:
                    return

            # A short page means the key range is exhausted.
            if len(key_slices) != buffer_size:
                return
            next_start = key_slices[-1].key
            first_page = False