Пример #1
0
    def __init__(self,
                 arg1,
                 shape=None,
                 filename="sparse.spy",
                 tablename="dok_matrix",
                 dtype=None,
                 copy=False,
                 commit_freq=1.0):
        """Create the matrix from a shape tuple, a sparse matrix or dense data.

        Parameters
        ----------
        arg1 : tuple, sparse matrix or array_like
            ``(M, N)`` creates an empty matrix of that shape.  A sparse
            matrix is converted with ``todok()`` (or copied when it is
            already DOK and ``copy`` is true).  Anything else is passed
            through ``np.asarray`` and must be two-dimensional.
        shape : optional
            Accepted in the signature but not read by this constructor.
        filename, tablename, commit_freq
            Forwarded unchanged to ``ddict.__init__``.
        dtype : optional
            Requested element type; resolved through ``getdtype`` with
            ``float`` as the default.
        copy : bool, optional
            Only consulted when ``arg1`` is already a DOK matrix.

        Raises
        ------
        TypeError
            If ``arg1`` cannot be converted to an array, or the resulting
            array is not two-dimensional.
        """
        spmatrix.__init__(self)

        self.dtype = getdtype(dtype, default=float)

        # Entries to load into the backing store once it has been set up;
        # stays None when only a shape was given.
        initial = None

        if isinstance(arg1, tuple) and isshape(arg1):  # (M,N)
            M, N = arg1
            self.shape = (M, N)
        elif isspmatrix(arg1):  # Sparse ctor
            if isspmatrix_dok(arg1) and copy:
                arg1 = arg1.copy()
            else:
                arg1 = arg1.todok()

            if dtype is not None:
                arg1 = arg1.astype(dtype)

            self.shape = arg1.shape
            self.dtype = arg1.dtype
            initial = arg1
        else:  # Dense ctor
            try:
                arg1 = np.asarray(arg1)
            except Exception:
                # Narrowed from a bare ``except`` so that KeyboardInterrupt
                # and SystemExit are no longer reported as a TypeError.
                raise TypeError('invalid input format')

            # NOTE: the message says "<=2" but only rank-2 input is accepted.
            if len(arg1.shape) != 2:
                raise TypeError('expected rank <=2 dense array or matrix')

            from scipy.sparse.coo import coo_matrix
            d = coo_matrix(arg1, dtype=dtype).todok()
            self.shape = arg1.shape
            self.dtype = d.dtype
            initial = d

        # The key (de)serialisers are built from the shape, so the shape
        # must be assigned before the backing dictionary is initialised.
        ddict.__init__(self,
                       filename,
                       tablename=tablename,
                       commit_freq=commit_freq,
                       key_types=("UNSIGNED INTEGER",
                                  int_tuple_ser(*self.shape),
                                  int_tuple_unser(*self.shape)),
                       val_types=("REAL", float, float))

        if initial is not None:
            ddict.update(self, initial)
Пример #2
0
    def test_getdtype(self):
        A = np.array([1], dtype='int8')

        assert_equal(sputils.getdtype(None, default=float), float)
        assert_equal(sputils.getdtype(None, a=A), np.int8)
Пример #3
0
    def test_getdtype(self):
        A = np.array([1],dtype='int8')

        assert_equal(sputils.getdtype(None,default=float),np.float)
        assert_equal(sputils.getdtype(None,a=A),np.int8)
Пример #4
0
    def __init__(self, arg1, shape=None, dtype=None, copy=False):
        """Build a compressed sparse matrix from one of several input forms.

        Parameters
        ----------
        arg1 : sparse matrix, tuple or array_like
            * a sparse matrix: converted to this format (copied when it is
              already in this format and ``copy`` is true);
            * a shape tuple ``(M, N)``: an empty matrix;
            * a 2-tuple: handed to ``coo_matrix`` together with ``shape``;
            * a 3-tuple ``(data, indices, indptr)``: used as the raw arrays;
            * anything else: converted with ``np.asarray`` and treated as
              dense input.
        shape : tuple, optional
            Explicit dimensions.  When omitted and not already known, they
            are inferred from ``indptr`` and ``indices``.
        dtype : optional
            Element type for ``data``.
        copy : bool, optional
            Whether the input arrays / matrix should be copied.

        Raises
        ------
        ValueError
            For an unrecognised tuple length, input that cannot be converted
            to an array, or when the dimensions cannot be inferred.
        """
        _data_matrix.__init__(self)

        if isspmatrix(arg1):
            if arg1.format == self.format and copy:
                arg1 = arg1.copy()
            else:
                arg1 = arg1.asformat(self.format)
            self._set_self(arg1)

        elif isinstance(arg1, tuple):
            if isshape(arg1):
                # It's a tuple of matrix dimensions (M, N)
                # create empty matrix
                self._shape = check_shape(arg1)
                M, N = self.shape
                # Select index dtype large enough to pass array and
                # scalar parameters to sparsetools
                idx_dtype = get_index_dtype(maxval=max(M, N))
                self.data = np.zeros(0, getdtype(dtype, default=float))
                self.indices = np.zeros(0, idx_dtype)
                self.indptr = np.zeros(self._swap((M, N))[0] + 1,
                                       dtype=idx_dtype)
            else:
                if len(arg1) == 2:
                    # (data, ij) format
                    from scipy.sparse.coo import coo_matrix
                    other = self.__class__(coo_matrix(arg1, shape=shape))
                    self._set_self(other)
                elif len(arg1) == 3:
                    # (data, indices, indptr) format
                    (data, indices, indptr) = arg1

                    # Select index dtype large enough to pass array and
                    # scalar parameters to sparsetools
                    maxval = None
                    if shape is not None:
                        maxval = max(shape)
                    idx_dtype = get_index_dtype((indices, indptr),
                                                maxval=maxval,
                                                check_contents=True)

                    # ``np.array(..., copy=False)`` raises on NumPy 2 when a
                    # copy cannot be avoided (list input, dtype change).
                    # ``np.asarray`` copies only if needed on every NumPy
                    # version, which is what ``copy=False`` used to mean.
                    to_array = np.array if copy else np.asarray
                    self.indices = to_array(indices, dtype=idx_dtype)
                    self.indptr = to_array(indptr, dtype=idx_dtype)
                    self.data = to_array(data, dtype=dtype)
                else:
                    raise ValueError("unrecognized {}_matrix "
                                     "constructor usage".format(self.format))

        else:
            # must be dense
            try:
                arg1 = np.asarray(arg1)
            except Exception:
                raise ValueError("unrecognized {}_matrix constructor usage"
                                 "".format(self.format))
            from scipy.sparse.coo import coo_matrix
            self._set_self(self.__class__(coo_matrix(arg1, dtype=dtype)))

        # Read matrix dimensions given, if any
        if shape is not None:
            self._shape = check_shape(shape)
        else:
            if self.shape is None:
                # shape not already set, try to infer dimensions
                try:
                    major_dim = len(self.indptr) - 1
                    minor_dim = self.indices.max() + 1
                except Exception:
                    raise ValueError('unable to infer matrix dimensions')
                else:
                    self._shape = check_shape(
                        self._swap((major_dim, minor_dim)))

        if dtype is not None:
            # ``astype(copy=False)`` still means "copy only if needed".
            self.data = self.data.astype(dtype, copy=False)

        self.check_format(full_check=False)
Пример #5
0
    def __init__(self, arg1, shape=None, dtype=None, copy=False):
        """Build a compressed sparse matrix whose arrays are dask arrays.

        Parameters
        ----------
        arg1 : sparse matrix, tuple or array_like
            * a sparse matrix: converted to this format (copied when it is
              already in this format and ``copy`` is true);
            * a shape tuple ``(M, N)``: an empty matrix;
            * a 2-tuple: handed to the package-local ``coo_matrix`` together
              with ``shape``;
            * a 3-tuple ``(data, indices, indptr)``: each array is wrapped
              with ``da.from_array`` using chunks of ``(10,)``;
            * anything else: converted with ``np.asarray`` and routed through
              ``scipy.sparse``'s ``coo_matrix``.
        shape : tuple, optional
            Explicit dimensions.  When omitted and not already known, they
            are inferred from ``indptr`` and ``indices``.
        dtype : optional
            Element type for ``data``.
        copy : bool, optional
            Only consulted when ``arg1`` is a sparse matrix.

        Raises
        ------
        ValueError
            For an unrecognised tuple length, input that cannot be converted
            to an array, or when the dimensions cannot be inferred.
        """
        _data_matrix.__init__(self)

        if isspmatrix(arg1):
            if arg1.format == self.format and copy:
                arg1 = arg1.copy()
            else:
                arg1 = arg1.asformat(self.format)
            self._set_self(arg1)

        elif isinstance(arg1, tuple):
            if isshape(arg1):
                # It's a tuple of matrix dimensions (M, N)
                # create empty matrix
                self.shape = arg1   # spmatrix checks for errors here
                M, N = self.shape
                idx_dtype = get_index_dtype(maxval=self._swap((M, N))[1])
                self.data = da.zeros(0, getdtype(dtype, default=float))
                self.indices = da.zeros(0, idx_dtype)
                self.indptr = da.zeros(self._swap((M, N))[0] + 1,
                                       dtype=idx_dtype)
            else:
                if len(arg1) == 2:
                    # (data, ij) format
                    from .coo import coo_matrix
                    other = self.__class__(coo_matrix(arg1, shape=shape))
                    self._set_self(other)
                elif len(arg1) == 3:
                    # (data, indices, indptr) format
                    (data, indices, indptr) = arg1
                    # NOTE(review): the selected index dtype is computed but
                    # never applied to the arrays below.  The call is kept
                    # because it inspects the inputs -- confirm whether the
                    # arrays should be cast to it.
                    idx_dtype = get_index_dtype((indices, indptr),
                                                check_contents=True)
                    chunks = (10,)
                    self.indices = da.from_array(indices, chunks=chunks)
                    self.indptr = da.from_array(indptr, chunks=chunks)
                    self.data = da.from_array(data, chunks=chunks)
                else:
                    raise ValueError("unrecognized %s_matrix constructor usage" %
                            self.format)

        else:
            # must be dense
            try:
                arg1 = np.asarray(arg1)
            except Exception:
                # Narrowed from a bare ``except`` so that KeyboardInterrupt
                # and SystemExit are not reported as a ValueError.
                raise ValueError("unrecognized %s_matrix constructor usage" %
                        self.format)
            from scipy.sparse.coo import coo_matrix
            self._set_self(self.__class__(coo_matrix(arg1, dtype=dtype)))

        # Read matrix dimensions given, if any
        if shape is not None:
            self.shape = shape   # spmatrix will check for errors
        else:
            if self.shape is None:
                # shape not already set, try to infer dimensions
                try:
                    major_dim = len(self.indptr) - 1
                    minor_dim = self.indices.max() + 1
                except Exception:
                    raise ValueError('unable to infer matrix dimensions')
                else:
                    self.shape = self._swap((major_dim, minor_dim))

        if dtype is not None:
            # NOTE(review): ``np.asarray`` presumably turns a dask-backed
            # ``data`` into an in-memory NumPy array -- confirm intended.
            self.data = np.asarray(self.data, dtype=dtype)

        self.check_format(full_check=False)
Пример #6
0
    def __init__(self, arg1, shape=None, dtype=None, copy=False):
        """Build a COO matrix from a shape, a triplet, a sparse or dense input.

        Parameters
        ----------
        arg1 : tuple, sparse matrix or array_like
            * ``(M, N)``: an empty matrix with NumPy index/data arrays;
            * ``(data, (row, col))``: the three arrays are stored as dask
              arrays, wrapping them with ``da.from_array`` when they are not
              dask arrays already;
            * a sparse matrix: its COO arrays are used (copied when it is
              already COO and ``copy`` is true);
            * anything else: treated as dense, promoted to at least 2-D.
        shape : tuple, optional
            Dimensions for the triplet form; inferred from the largest
            row / column index when omitted.
        dtype : optional
            When given, ``data`` is cast to it at the end.
        copy : bool, optional
            Only consulted when ``arg1`` is already a COO matrix.

        Raises
        ------
        TypeError
            If a non-shape tuple cannot be unpacked as ``(data, (row, col))``
            or the dense input is not two-dimensional.
        ValueError
            If the shape has to be inferred from empty index arrays.
        """
        _data_matrix.__init__(self)

        self.chunks = (10, 1)

        if isinstance(arg1, tuple) and isshape(arg1):
            # Shape only: start out empty.
            n_rows, n_cols = arg1
            self.shape = (n_rows, n_cols)
            index_type = get_index_dtype(maxval=max(n_rows, n_cols))
            self.row = np.array([], dtype=index_type)
            self.col = np.array([], dtype=index_type)
            self.data = np.array([], getdtype(dtype, default=float))
            self.has_canonical_format = True

        elif isinstance(arg1, tuple):
            # (data, (row, col)) triplet.
            try:
                obj, (row, col) = arg1
            except (TypeError, ValueError):
                raise TypeError('invalid input format')

            if shape is not None:
                # Unpack first so a shape of the wrong length fails here.
                n_rows, n_cols = shape
                self.shape = (n_rows, n_cols)
            else:
                if len(row) == 0 or len(col) == 0:
                    raise ValueError('cannot infer dimensions from zero '
                                     'sized index arrays')
                n_rows = np.max(row) + 1
                n_cols = np.max(col) + 1
                self.shape = (n_rows, n_cols)

            # Computed as in the original; the result is not used further.
            index_type = get_index_dtype(maxval=max(self.shape))

            # Store each component as a dask array, wrapping when necessary.
            for attr, values in (("row", row), ("col", col), ("data", obj)):
                if not isinstance(values, da.core.Array):
                    values = da.from_array(values, chunks=self.chunks)
                setattr(self, attr, values)

            self.has_canonical_format = False

        elif isspmatrix(arg1):
            if isspmatrix_coo(arg1) and copy:
                self.row = arg1.row.copy()
                self.col = arg1.col.copy()
                self.data = arg1.data.copy()
                self.shape = arg1.shape
            else:
                converted = arg1.tocoo()
                self.row = converted.row
                self.col = converted.col
                self.data = converted.data
                self.shape = converted.shape
            self.has_canonical_format = False

        else:
            # Dense argument.
            dense = np.atleast_2d(np.asarray(arg1))
            if dense.ndim != 2:
                raise TypeError('expected dimension <= 2 array or matrix')
            self.shape = dense.shape

            self.row, self.col = dense.nonzero()
            self.data = dense[self.row, self.col]
            self.has_canonical_format = True

        if dtype is not None:
            self.data = self.data.astype(dtype)

        self._check()
Пример #7
0
    def __init__(self,
                 arg1,
                 block_size=None,
                 n_samples=None,
                 n_history=None,
                 shape=None,
                 dtype=None,
                 copy=False):
        """Build a delta-encoded CSR matrix from one of several input forms.

        Parameters
        ----------
        arg1 : sparse matrix, tuple, generator or array_like
            * a sparse matrix in this format: adopted via ``_set_self``;
            * a CSR matrix: converted with ``_csr_to_delta_csr``;
            * a shape tuple ``(M, N)``: an empty matrix;
            * ``(data, indices, indptr[, deltas])``: used as the raw arrays;
            * a generator of rows, or dense data: built row by row with
              ``_construct_from_iterable``.
        block_size, n_samples
            Forwarded to ``_csr_to_delta_csr`` / ``_construct_from_iterable``.
        n_history
            Forwarded to ``_csr_to_delta_csr`` only.
        shape : tuple, optional
            Explicit dimensions; inferred from the raw arrays when omitted.
        dtype : optional
            Element type used for the empty-matrix and generator forms.
        copy : bool, optional
            Accepted in the signature but not read by this constructor.

        Raises
        ------
        NotImplementedError
            For sparse formats other than this one and CSR, and for 2-tuples
            (COO-style input).
        ValueError
            For tuples of unsupported length, raw arrays from which the
            shape cannot be inferred, or input that cannot be converted to
            an array.
        """
        _data_matrix.__init__(self)

        # case 1: instantiate from another sparse matrix
        if isspmatrix(arg1):
            if arg1.format == self.format:
                self._set_self(arg1)
            elif arg1.format == "csr":
                self._csr_to_delta_csr(arg1, block_size, n_samples, n_history)
            else:
                raise NotImplementedError(
                    "Instantiation from sparse matrix not yet ready")

        # case 2: instantiate from some kind of raw data
        elif isinstance(arg1, tuple):
            if isshape(arg1):
                # input is size specification (M,N) for empty matrix
                # code mostly taken from scipy CSR implementation, other than an
                # additional line to instantiate deltas array
                self.shape = arg1
                M, N = self.shape
                idx_dtype = get_index_dtype(maxval=max(M, N))
                self.data = np.zeros(0, getdtype(dtype, default='float'))
                self.indices = np.zeros(0, idx_dtype)
                self.indptr = np.zeros(self._swap((M, N))[0] + 1,
                                       dtype=idx_dtype)
                self.deltas = np.zeros(0, dtype=idx_dtype)
            elif len(arg1) == 2:
                # COO data format
                raise NotImplementedError(
                    "Instantiation from COO format not yet ready")
            elif len(arg1) in (3, 4):
                # contents of the tuple are the raw data structures
                data, indices, indptr = arg1[:3]
                self.data, self.indices, self.indptr = data, indices, indptr
                # use given shape or automatically infer one
                if shape is not None:
                    self.shape = shape
                else:
                    # Previously this referenced undefined names (NameError)
                    # and forgot that the column count is the largest column
                    # index plus one.
                    try:
                        M = len(indptr) - 1
                        N = np.max(indices) + 1
                    except Exception:
                        raise ValueError('unable to infer matrix dimensions')
                    self.shape = (M, N)
                # a fourth array, for the deltas pointer, should always be
                # given in general use, but we also allow for the case where
                # it is omitted in order to maintain backwards compatibility
                # with superclass methods. In this case we just let each
                # deltas[i] = i; in other words treating this matrix as a
                # standard CSR matrix with no delta encoding
                self.deltas = arg1[3] if len(arg1) > 3 else np.arange(
                    self.shape[0])
            else:
                # Any other tuple length used to fall through with no
                # attributes set; fail explicitly instead.
                raise ValueError(
                    "unrecognized delta_csr_matrix constructor usage")

        # case 3: instantiate from generator object
        elif isinstance(arg1, types.GeneratorType):
            self._construct_from_iterable(arg1, getdtype(dtype,
                                                         default='float'),
                                          np.int32, block_size, n_samples,
                                          shape)

        # case 4: instantiate from dense matrix / array
        else:
            try:
                arg1 = np.asarray(arg1)
            except Exception:
                # Narrowed from a bare ``except`` so that KeyboardInterrupt
                # and SystemExit are not reported as a ValueError.
                raise ValueError(
                    "unrecognized delta_csr_matrix constructor usage")
            # create a generator expression for iterating over rows of arg1
            row_gen = (arg1[i, :] for i in range(arg1.shape[0]))
            self._construct_from_iterable(
                row_gen,
                arg1.dtype,
                get_index_dtype(maxval=max(*arg1.shape)),
                block_size,
                n_samples,
                shape=arg1.shape)

        self.check_format(full_check=False)
Пример #8
0
    def _construct_from_iterable(self,
                                 rows,
                                 dtype,
                                 idx_dtype,
                                 block_size=None,
                                 n_samples=None,
                                 shape=None,
                                 data_size=10000):
        """
        Build a delta encoded sparse matrix row-by-row from an iterable of
        rows (e.g. a dense matrix or a CSR matrix)

        Parameters
        ----------
        rows : iterable
            Rows to add; each must expose ``shape`` and all must have the
            same length along the last axis.
        dtype
            Element type for ``data`` (resolved through ``getdtype``).
        idx_dtype
            Type used for ``indices``, ``indptr`` and ``deltas``.
        block_size : int, optional
            Passed to ``HashSimilarityDetector``; defaults to one tenth of
            the number of columns.
        n_samples : optional
            Passed to ``HashSimilarityDetector``.
        shape : tuple, optional
            Expected ``(M, N)``; either is inferred from the rows when
            ``shape`` is omitted.
        data_size : int, optional
            Initial allocation for ``data`` and ``indices``.

        Raises
        ------
        ValueError
            If rows differ in length, or the number of rows does not match
            ``shape``.
        """

        # data structures are initially empty
        self.data = np.zeros(data_size, getdtype(dtype, default='float'))
        self.indices = np.zeros(data_size, idx_dtype)
        self.indptr = np.zeros(10, dtype=idx_dtype)
        self.deltas = np.zeros(10, dtype=idx_dtype)

        M, N = shape if shape is not None else (None, None)

        # keep track of how many rows we have added thus far
        num_rows_added = 0
        # keep track of how large the data array currently is
        data_added = 0

        def make_detector(n_cols):
            # the default block size depends on the number of columns
            size = block_size if block_size is not None else n_cols // 10
            return HashSimilarityDetector(size, n_samples)

        # use a HashSimilarityDetector to locate reference rows. When neither
        # the block size nor the column count is known yet (no shape given),
        # creation is postponed until the first row has told us N; computing
        # ``N // 10`` up front would fail on ``None``.
        sd = None
        if block_size is not None or N is not None:
            sd = make_detector(N)

        for row in rows:
            # from the first row we can infer the number of columns
            if N is None:
                N = row.shape[-1]
            if sd is None:
                sd = make_detector(N)
            # all rows should be the same length
            if row.shape[-1] != N:
                raise ValueError(
                    "Inconsistent row sizes passed (expected %d, got %d)" %
                    (N, row.shape[-1]))
            # before we start: we will have to update self.deltas at some point,
            # which being dynamically allocated may not have enough allocated
            # space to update. As such, we should expand it if necessary
            if self.deltas.shape[0] < num_rows_added + 1:
                self.deltas.resize(self.deltas.shape[0] * 2)

            # use the HashSimilarityDetector to locate a row sufficiently
            # similar to this one to serve as a reference row
            # first, represent the row as a string:
            row_str = vec_to_str(row)
            # then look up a match
            ref = sd.get_best_match(row_str)

            # if no match was found, store the row directly
            # NOTE(review): presumably ``_append_row_data`` also writes (and
            # grows) ``self.indptr``; that is not visible here -- confirm.
            if ref == -1:
                data_added += self._append_row_data(row, num_rows_added,
                                                    data_added)
                # update self.deltas so that this row points to itself
                self.deltas[num_rows_added] = num_rows_added
                # since this row was added directly, we can consider it as a
                # candidate for a reference row in the future
                sd.add(row_str, num_rows_added)
            else:
                # reconstruct the reference row
                ref_row = np.zeros(N)
                start_idx = self.indptr[ref]
                end_idx = self.indptr[ref + 1]
                ref_row[self.indices[start_idx:end_idx]] = self.data[
                    start_idx:end_idx]
                # now compute the difference
                row_as_array = row.toarray().flatten() if isspmatrix(
                    row) else row
                delta = row_as_array - ref_row
                # add the delta vector to the matrix
                data_added += self._append_row_data(delta, num_rows_added,
                                                    data_added)
                # update self.deltas to point to the reference row
                self.deltas[num_rows_added] = ref
            num_rows_added += 1

        # Once all rows have been added we can infer the height, if not given
        if M is None:
            M = num_rows_added
        # sanity check that the number of rows added equals what we were told
        if num_rows_added != M:
            raise ValueError(
                "Number of rows provided not consistent with specified shape")

        # update this object's shape variable
        self.shape = (M, N)

        # since we dynamically allocate our data structures for performance
        # reasons, we must resize them to reflect how much data has actually
        # been added.
        self.data = np.resize(self.data, data_added)
        self.indices = np.resize(self.indices, data_added)
        self.indptr.resize(num_rows_added + 1)
        self.deltas.resize(num_rows_added)