Example #1
    def _fit_eig(self, x):
        scatter_matrix = _scatter_matrix(x, self.arity)
        cov_matrix = _estimate_covariance(scatter_matrix, x.shape[0])

        if self.n_components:
            shape1 = self.n_components
        else:
            shape1 = x.shape[1]

        n_blocks = int(ceil(shape1 / x._reg_shape[1]))

        val_blocks = Array._get_out_blocks((1, n_blocks))
        vec_blocks = Array._get_out_blocks((n_blocks, x._n_blocks[1]))

        _decompose(cov_matrix, self.n_components, x._reg_shape[1],
                   val_blocks,
                   vec_blocks)

        bshape = (x._reg_shape[1], x._reg_shape[1])

        self.components_ = Array(vec_blocks, bshape, bshape,
                                 (shape1, x.shape[1]), False)
        self.explained_variance_ = Array(val_blocks, bshape, bshape,
                                         (1, shape1), False)

        return self
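For context, _fit_eig is the eigendecomposition path of dislib's PCA: it
assembles a covariance matrix from per-block scatter matrices and decomposes
it into components_ and explained_variance_. A minimal usage sketch of the
public estimator, assuming the standard dislib entry points (ds.array and
dislib.decomposition.PCA):

import numpy as np
import dislib as ds
from dislib.decomposition import PCA

x = ds.array(np.random.random((1000, 10)), block_size=(100, 5))
pca = PCA(n_components=2)  # the eig path is selected internally
pca.fit(x)
print(pca.components_.collect())          # (2, 10) principal axes
print(pca.explained_variance_.collect())  # variance along each axis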
Example #2
def _sort_v(v, sorting):
    v_blocks = [[] for _ in range(v._n_blocks[1])]
    hbsize = v._reg_shape[1]

    for i, vblock in enumerate(v._iterator("columns")):
        out_blocks = [[] for _ in range(v._n_blocks[1])]
        _sort_v_block(vblock._blocks, i, hbsize, sorting, out_blocks)

        for j in range(len(out_blocks)):
            v_blocks[j].append(out_blocks[j])

    vbsize = v._reg_shape[0]
    final_blocks = Array._get_out_blocks(v._n_blocks)

    for i, v_block in enumerate(v_blocks):
        new_block = [object() for _ in range(v._n_blocks[0])]
        _merge_svd_block(v_block, i, hbsize, vbsize, sorting, new_block)

        for j in range(len(new_block)):
            final_blocks[j][i] = new_block[j]

        for elem in v_block:
            compss_delete_object(elem)

    return Array(final_blocks, v._top_left_shape, v._reg_shape, v.shape,
                 v._sparse)
Example #3
def _compute_u_sorted(a, sorting):
    u_blocks = [[] for _ in range(a._n_blocks[1])]
    hbsize = a._reg_shape[1]

    for i, vblock in enumerate(a._iterator("columns")):
        u_block = [object() for _ in range(a._n_blocks[1])]
        _compute_u_block_sorted(vblock._blocks, i, hbsize, sorting, u_block)

        for j in range(len(u_block)):
            u_blocks[j].append(u_block[j])

    vbsize = a._reg_shape[0]
    final_blocks = Array._get_out_blocks(a._n_blocks)

    for i, u_block in enumerate(u_blocks):
        new_block = [object() for _ in range(a._n_blocks[0])]
        _merge_svd_block(u_block, i, hbsize, vbsize, sorting, new_block)

        for j in range(len(new_block)):
            final_blocks[j][i] = new_block[j]

        for elem in u_block:
            compss_delete_object(elem)

    return Array(final_blocks, a._top_left_shape, a._reg_shape, a.shape,
                 a._sparse)
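Examples #2 and #3 are symmetric post-processing steps of dislib's blocked
SVD: once the rotations converge, the columns of U and V are reordered block
by block to match the sorted singular values. A sketch of calling the public
routine, treating the import path as an assumption:

import numpy as np
import dislib as ds
from dislib.math import svd  # assumption: svd is exposed here

a = ds.array(np.random.random((64, 64)), block_size=(16, 16))
u, s, v = svd(a)  # singular vectors come back sorted by singular value
print(s.collect())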
Example #4
def _update_u(z, u_blocks, w_blocks, out_blocks):
    u_np = np.squeeze(Array._merge_blocks(u_blocks))
    w_np = np.squeeze(Array._merge_blocks(w_blocks))
    u_new = u_np + w_np - z
    n_cols = u_blocks[0][0].shape[1]

    for i in range(len(out_blocks)):
        out_blocks[i] = u_new[i * n_cols:(i + 1) * n_cols].reshape(1, -1)
Example #5
def _partial_variability_params(x, y, mean_x, mean_y):
    x, y = Array._merge_blocks(x), Array._merge_blocks(y)

    normalized_x = x[:, 0] - mean_x  # the 0 is because only 1D LR is supported
    normalized_y = y - mean_y
    normalized_xy_dot = np.dot(normalized_x, normalized_y)
    normalized_xx_dot = np.dot(normalized_x, normalized_x)
    return normalized_xy_dot, normalized_xx_dot
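The two dot products returned by _partial_variability_params are the
sufficient statistics of simple (1D) linear regression: summed over all
blocks, they yield the slope and intercept in closed form. A sketch of the
reduction a driver could apply (combine_variability is a hypothetical helper,
not part of dislib):

import numpy as np

def combine_variability(xy_dots, xx_dots, mean_x, mean_y):
    # slope = sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x) ** 2)
    slope = np.sum(xy_dots) / np.sum(xx_dots)
    intercept = mean_y - slope * mean_x
    return slope, intercept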
Example #6
def _score(x_list, y_list, clf):
    x = Array._merge_blocks(x_list)
    y = Array._merge_blocks(y_list)

    y_pred = clf.predict(x)
    equal = np.equal(y_pred, y.ravel())

    return np.sum(equal), x.shape[0]
Example #7
    def kneighbors(self, x, n_neighbors=None, return_distance=True):
        """ Finds the K nearest neighbors of the input samples. Returns
        indices and distances to the neighbors of each sample.

        Parameters
        ----------
        x : ds-array, shape=(n_samples, n_features)
            The query samples.
        n_neighbors: int, optional (default=None)
            Number of neighbors to get. If None, the value passed in the
            constructor is employed.
        return_distance : boolean, optional (default=True)
            Whether to return distances.

        Returns
        -------
        dist : ds-array, shape=(n_samples, n_neighbors)
            Distances to the nearest neighbors of each query sample; only
            returned if return_distance=True.
        ind : ds-array, shape=(n_samples, n_neighbors)
            Indices of the nearest samples in the fitted data.
        """
        validation.check_is_fitted(self, '_fit_data')

        if n_neighbors is None:
            n_neighbors = self.n_neighbors

        distances = []
        indices = []

        for q_row in x._iterator(axis=0):
            queries = []

            for row in self._fit_data._iterator(axis=0):
                queries.append(
                    _get_neighbors(row._blocks, q_row._blocks, n_neighbors))

            dist, ind = _merge_queries(*queries)
            distances.append([dist])
            indices.append([ind])

        ind_arr = Array(blocks=indices,
                        top_left_shape=(x._top_left_shape[0], n_neighbors),
                        reg_shape=(x._reg_shape[0], n_neighbors),
                        shape=(x.shape[0], n_neighbors),
                        sparse=False)

        if return_distance:
            dst_arr = Array(blocks=distances,
                            top_left_shape=(x._top_left_shape[0], n_neighbors),
                            reg_shape=(x._reg_shape[0], n_neighbors),
                            shape=(x.shape[0], n_neighbors),
                            sparse=False)
            return dst_arr, ind_arr

        return ind_arr
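A minimal end-to-end sketch around kneighbors, assuming the usual dislib
entry points (ds.array and dislib.neighbors.NearestNeighbors):

import numpy as np
import dislib as ds
from dislib.neighbors import NearestNeighbors

data = ds.array(np.random.random((100, 5)), block_size=(25, 5))
knn = NearestNeighbors(n_neighbors=3)
knn.fit(data)
dist, ind = knn.kneighbors(data)  # both returned as ds-arrays
print(dist.collect(), ind.collect())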
Example #8
def _partial_estimate_parameters(x, resp):
    x = Array._merge_blocks(x)
    resp = Array._merge_blocks(resp)
    partial_nk = resp.sum(axis=0)
    if issparse(x):
        partial_means = x.T.dot(resp).T
    else:
        partial_means = np.matmul(resp.T, x)

    return x.shape[0], partial_nk, partial_means
Example #9
def _partial_covar_diag(resp, x, means):
    x = Array._merge_blocks(x)
    resp = Array._merge_blocks(resp)
    if issparse(x):
        avg_resp_sample_2 = x.multiply(x).T.dot(resp).T
        avg_sample_means = means * x.T.dot(resp).T
    else:
        avg_resp_sample_2 = np.dot(resp.T, x * x)
        avg_sample_means = means * np.dot(resp.T, x)
    return avg_resp_sample_2 - 2 * avg_sample_means
Example #10
def _choose_and_assign_rows_xy(x, y, subsamples_sizes, subsamples, seed):
    np.random.seed(seed)
    x = Array._merge_blocks(x)
    y = Array._merge_blocks(y)
    indices = np.random.permutation(x.shape[0])
    start = 0
    for i, size in enumerate(subsamples_sizes):
        end = start + size
        subsamples[i] = (x[indices[start:end]], y[indices[start:end]])
        start = end
Example #11
def _generate_bins(mn, mx, dimensions, n_regions):
    bins = []
    mn_arr = Array._merge_blocks(mn)[0]
    mx_arr = Array._merge_blocks(mx)[0]

    # create bins for the different regions in the grid in every dimension
    for dim in dimensions:
        bin_ = np.linspace(mn_arr[dim], mx_arr[dim], n_regions + 1)
        bins.append(bin_)

    return bins
Example #12
def _get_neighbors(blocks, q_blocks, n_neighbors):
    samples = Array._merge_blocks(blocks)
    q_samples = Array._merge_blocks(q_blocks)

    n_samples = samples.shape[0]

    knn = SKNeighbors(n_neighbors=n_neighbors)
    knn.fit(X=samples)
    dist, ind = knn.kneighbors(X=q_samples)

    return dist, ind, n_samples
Example #13
    def _u_step(self):
        u_blocks = []

        for u_hblock, w_hblock in zip(self._u._iterator(),
                                      self._w._iterator()):
            out_blocks = [object() for _ in range(self._u._n_blocks[1])]
            _update_u(self._z, u_hblock._blocks, w_hblock._blocks, out_blocks)
            u_blocks.append(out_blocks)

        r_shape = self._u._reg_shape
        shape = self._u.shape
        self._u = Array(u_blocks, r_shape, r_shape, shape, self._u._sparse)
Example #14
def _transform(blocks, m_blocks, v_blocks, out_blocks):
    x = Array._merge_blocks(blocks)
    mean = Array._merge_blocks(m_blocks)
    var = Array._merge_blocks(v_blocks)
    scaled_x = (x - mean) / np.sqrt(var)

    constructor_func = np.array if not issparse(x) else csr_matrix
    start, end = 0, 0

    for i, block in enumerate(blocks[0]):
        end += block.shape[1]
        out_blocks[i] = constructor_func(scaled_x[:, start:end])
        start = end  # advance the column window past this block
Example #15
def _merge(x_list, y_list, id_list):
    samples = Array._merge_blocks(x_list)
    labels = Array._merge_blocks(y_list)
    sample_ids = Array._merge_blocks(id_list)

    _, uniques = np.unique(sample_ids, return_index=True)
    indices = np.argsort(uniques)
    uniques = uniques[indices]

    sample_ids = sample_ids[uniques]
    samples = samples[uniques]
    labels = labels[uniques]

    return samples, labels, sample_ids
Example #16
    def _compute_primal_res(self, z_old):
        blocks = []

        for w_hblock in self._w._iterator():
            out_blocks = [object() for _ in range(self._w._n_blocks[1])]
            _substract(w_hblock._blocks, z_old, out_blocks)
            blocks.append(out_blocks)

        prires = Array(blocks, self._w._reg_shape, self._w._reg_shape,
                       self._w.shape, self._w._sparse)

        # prires.norm(axis=1) is a ds-array of row norms; squaring, summing
        # and taking the sqrt reduces it to a single-element ds-array
        return (prires.norm(axis=1)**2).sum().sqrt()
Example #17
def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset:
    """Creates an RfDataset object from samples x and labels y.

    This function creates a dislib.classification.rf._data.RfDataset by
    saving x and y in files.

    Parameters
    ----------
    x : ds-array, shape = (n_samples, n_features)
        The training input samples.
    y : ds-array, shape = (n_samples,) or (n_samples, n_outputs)
        The target values.

    Returns
    -------
    rf_dataset : dislib.classification.rf._data.RfDataset

    """
    n_samples = x.shape[0]
    n_features = x.shape[1]

    samples_file = tempfile.NamedTemporaryFile(mode='wb',
                                               prefix='tmp_rf_samples_',
                                               delete=False)
    samples_path = samples_file.name
    samples_file.close()
    _allocate_samples_file(samples_path, n_samples, n_features)

    start_idx = 0
    row_blocks_iterator = x._iterator(axis=0)
    top_row = next(row_blocks_iterator)
    _fill_samples_file(samples_path, top_row._blocks, start_idx)
    start_idx += x._top_left_shape[0]
    for x_row in row_blocks_iterator:
        _fill_samples_file(samples_path, x_row._blocks, start_idx)
        start_idx += x._reg_shape[0]

    labels_file = tempfile.NamedTemporaryFile(mode='w',
                                              prefix='tmp_rf_labels_',
                                              delete=False)
    labels_path = labels_file.name
    labels_file.close()
    for y_row in y._iterator(axis=0):
        _fill_labels_file(labels_path, y_row._blocks)

    rf_dataset = RfDataset(samples_path, labels_path)
    rf_dataset.n_samples = n_samples
    rf_dataset.n_features = n_features
    return rf_dataset
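In dislib this conversion normally happens inside the random forest's fit,
but the function can also be driven directly. A sketch, with the import path
taken as an assumption:

import numpy as np
import dislib as ds
from dislib.classification.rf._data import transform_to_rf_dataset  # assumed path

x = ds.array(np.random.random((300, 8)), block_size=(100, 4))
y = ds.array(np.random.randint(0, 2, (300, 1)), block_size=(100, 1))

dataset = transform_to_rf_dataset(x, y)
print(dataset.n_samples, dataset.n_features)  # 300 8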
Example #18
def _compute_var(blocks, m_blocks):
    x = Array._merge_blocks(blocks)
    mean = Array._merge_blocks(m_blocks)
    sparse = issparse(x)

    if sparse:
        x = x.toarray()
        mean = mean.toarray()

    var = np.mean(np.array(x - mean)**2, axis=0)

    if sparse:
        return csr_matrix(var)
    else:
        return var
Example #19
def _soft_thresholding(w_blocks, u_blocks, k):
    w_mean = np.squeeze(Array._merge_blocks(w_blocks))
    u_mean = np.squeeze(Array._merge_blocks(u_blocks))
    v = w_mean + u_mean

    z = np.zeros(v.shape)
    for i in range(z.shape[0]):
        if np.abs(v[i]) <= k:
            z[i] = 0
        elif v[i] > k:
            z[i] = v[i] - k
        else:
            z[i] = v[i] + k
    return z
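The loop above is the soft-thresholding (proximal) operator of the l1 norm:
it shrinks every coefficient toward zero by k and zeroes anything inside
[-k, k]. An equivalent vectorized form, shown as a sketch:

import numpy as np

def soft_threshold(v, k):
    # sign(v) * max(|v| - k, 0) reproduces the three branches above
    return np.sign(v) * np.maximum(np.abs(v) - k, 0.0)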
Example #20
def _partial_covar_full(resp, x, means):
    x = Array._merge_blocks(x)
    resp = Array._merge_blocks(resp)
    n_components, n_features = means.shape
    covariances = np.empty((n_components, n_features, n_features))
    for k in range(n_components):
        if issparse(x):
            diff = (row - means[k] for row in x)  # avoid shadowing x
            partial_covs = (np.dot(r * d.T, d) for d, r in
                            zip(diff, resp[:, k]))
            covariances[k] = sum(partial_covs)
        else:
            diff = x - means[k]
            covariances[k] = np.dot(resp[:, k] * diff.T, diff)
    return covariances
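Examples #9, #20, and #23 are the per-block pieces of the diag, full, and
tied covariance updates of dislib's Gaussian mixture. A usage sketch of the
public estimator, assuming it lives in dislib.cluster and takes
covariance_type as in scikit-learn:

import numpy as np
import dislib as ds
from dislib.cluster import GaussianMixture

x = ds.array(np.random.random((500, 3)), block_size=(100, 3))
gm = GaussianMixture(n_components=2, covariance_type='full')
gm.fit(x)
labels = gm.predict(x)  # per-sample component assignments, as a ds-array
print(labels.collect())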
Example #21
    def transform(self, x):
        """
        Standardize data.

        Parameters
        ----------
        x : ds-array, shape=(n_samples, n_features)

        Returns
        -------
        x_new : ds-array, shape=(n_samples, n_features)
            Scaled data.
        """
        if self.mean_ is None or self.var_ is None:
            raise Exception("Model has not been initialized.")

        n_blocks = x._n_blocks[1]
        blocks = []
        m_blocks = self.mean_._blocks
        v_blocks = self.var_._blocks

        for row in x._iterator(axis=0):
            out_blocks = [object() for _ in range(n_blocks)]
            _transform(row._blocks, m_blocks, v_blocks, out_blocks)
            blocks.append(out_blocks)

        return Array(blocks,
                     top_left_shape=x._top_left_shape,
                     reg_shape=x._reg_shape,
                     shape=x.shape,
                     sparse=x._sparse)
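Fit and transform together, as a minimal sketch (assuming
dislib.preprocessing.StandardScaler, whose fit computes the mean_ and var_
used above):

import numpy as np
import dislib as ds
from dislib.preprocessing import StandardScaler

x = ds.array(np.random.random((200, 4)), block_size=(50, 2))
scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)
print(x_scaled.collect().mean(axis=0))  # approximately zero per column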
Example #22
    def fit(self, x, y):
        """
        Fits the model with training data.

        Parameters
        ----------
        x : ds-array, shape=(n_samples, n_features)
            Training samples.
        y : ds-array, shape=(n_samples, 1)
            Class labels of x.

        Returns
        -------
        self : ADMM
        """
        if not x._is_regular():
            x_reg = x.rechunk(x._reg_shape)
        else:
            x_reg = x

        self._init_model(x_reg)

        while not self.converged_ and self.n_iter_ < self.max_iter:
            self._step(x_reg, y)
            self.n_iter_ += 1

            if self.verbose:
                print("Iteration ", self.n_iter_)

        z_blocks = [object() for _ in range(x_reg._n_blocks[1])]
        _split_z(self._z, x_reg._reg_shape[1], z_blocks)
        self.z_ = Array([z_blocks], (1, x_reg._reg_shape[1]),
                        (1, x_reg._reg_shape[1]), (1, x_reg.shape[1]), False)

        return self
Example #23
def _partial_covar_tied(x):
    x = Array._merge_blocks(x)
    if issparse(x):
        avg_sample_2 = x.T.dot(x)
    else:
        avg_sample_2 = np.dot(x.T, x)
    return avg_sample_2
Example #24
    def predict(self, x):
        """
        Predict using the linear model.

        Parameters
        ----------
        x : ds-array, shape=(n_samples, 1)
            Samples to be predicted.

        Returns
        -------
        y : ds-array, shape=(n_samples, 1)
            Predicted values.
        """

        blocks = [list()]

        for r_block in x._iterator(axis='rows'):
            blocks[0].append(
                _predict(r_block._blocks, self.coef_, self.intercept_))

        return Array(blocks=blocks,
                     top_left_shape=(x._top_left_shape[0], 1),
                     reg_shape=(x._reg_shape[0], 1),
                     shape=(x.shape[0], 1),
                     sparse=x._sparse)
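A minimal sketch of fitting and predicting with the 1D linear model
(assuming dislib.regression.LinearRegression; per Example #5, only
single-feature regression is supported):

import numpy as np
import dislib as ds
from dislib.regression import LinearRegression

x = ds.array(np.arange(10, dtype=float).reshape(-1, 1), block_size=(5, 1))
y = ds.array((3 * np.arange(10, dtype=float) + 1).reshape(-1, 1),
             block_size=(5, 1))

reg = LinearRegression()
reg.fit(x, y)
print(reg.predict(x).collect())  # approximately 3x + 1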
Example #25
def _rotate(coli_blocks, colj_blocks, j):
    if j is None:
        return

    coli = Array._merge_blocks(coli_blocks)
    colj = Array._merge_blocks(colj_blocks)

    n = coli.shape[1]
    coli_k = coli @ j[:n, :n] + colj @ j[n:, :n]
    colj_k = coli @ j[:n, n:] + colj @ j[n:, n:]

    block_size = coli_blocks[0][0].shape[0]

    for i in range(len(coli_blocks)):
        coli_blocks[i][0][:] = coli_k[i * block_size:(i + 1) * block_size][:]
        colj_blocks[i][0][:] = colj_k[i * block_size:(i + 1) * block_size][:]
Example #26
def _subset_transform(blocks, u_blocks, c_blocks, reg_shape, out_blocks):
    data = Array._merge_blocks(blocks)
    mean = Array._merge_blocks(u_blocks)
    components = Array._merge_blocks(c_blocks)

    if issparse(data):
        data = data.toarray()
        mean = mean.toarray()

    res = (np.matmul(data - mean, components.T))

    if issparse(data):
        res = csr_matrix(res)

    for j in range(0, len(blocks[0])):
        out_blocks[j] = res[:, j * reg_shape:(j + 1) * reg_shape]
Example #27
def _subset_scatter_matrix(blocks):
    data = Array._merge_blocks(blocks)

    if issparse(data):
        data = data.toarray()

    return np.dot(data.T, data)
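Per-block scatter matrices are additive: summing block.T @ block over
row-blocks yields the full Gram matrix X.T @ X, which is what lets Example #1
assemble a covariance matrix without centralizing the data. A self-contained
numpy check:

import numpy as np

x = np.random.random((6, 3))
row_blocks = np.split(x, 3)  # three row-blocks of two samples each
partial_sum = sum(b.T @ b for b in row_blocks)
assert np.allclose(partial_sum, x.T @ x)  # block scatters add up exactly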
Example #28
    def predict(self, x):
        """ Perform classification on samples.

        Parameters
        ----------
        x : ds-array, shape=(n_samples, n_features)
            Input samples.

        Returns
        -------
        y : ds-array, shape=(n_samples, 1)
            Class labels of x.
        """
        assert (self._clf is not None or self._svs is not None), \
            "Model has not been initialized. Call fit() first."

        y_list = []

        for row in x._iterator(axis=0):
            y_list.append([_predict(row._blocks, self._clf)])

        return Array(blocks=y_list,
                     top_left_shape=(x._top_left_shape[0], 1),
                     reg_shape=(x._reg_shape[0], 1),
                     shape=(x.shape[0], 1),
                     sparse=False)
Example #29
def _load_mdcrd(path, block_size, n_cols, n_blocks, bytes_per_snap,
                bytes_per_block):
    blocks = []

    file_size = os.stat(path).st_size - _CRD_LINE_SIZE

    # "with" also covers the case where open() itself fails, which the old
    # try/finally did not (fid would be unbound in the finally block)
    with open(path, "rb") as fid:
        fid.read(_CRD_LINE_SIZE)  # skip header

        for _ in range(0, file_size, bytes_per_block):
            data = fid.read(bytes_per_block)
            out_blocks = [object() for _ in range(n_blocks)]
            _read_crd_bytes(data, block_size[1], n_cols, out_blocks)
            compss_delete_object(data)
            blocks.append(out_blocks)

    n_samples = int(file_size / bytes_per_snap)

    return Array(blocks,
                 top_left_shape=block_size,
                 reg_shape=block_size,
                 shape=(n_samples, n_cols),
                 sparse=False)
Example #30
    def decision_function(self, x):
        """ Evaluates the decision function for the samples in x.

        Parameters
        ----------
        x : ds-array, shape=(n_samples, n_features)
            Input samples.

        Returns
        -------
        df : ds-array, shape=(n_samples, 1)
            The signed distance of each sample to the decision boundary.
        """
        assert (self._clf is not None or self._svs is not None), \
            "Model has not been initialized. Call fit() first."

        df = []

        for row in x._iterator(axis=0):
            df.append([_decision_function(row._blocks, self._clf)])

        return Array(blocks=df,
                     top_left_shape=(x._top_left_shape[0], 1),
                     reg_shape=(x._reg_shape[0], 1),
                     shape=(x.shape[0], 1),
                     sparse=False)
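Examples #28 and #30 share the same estimator; a combined usage sketch,
assuming dislib.classification.CascadeSVC with scikit-learn-like fit/predict
semantics:

import numpy as np
import dislib as ds
from dislib.classification import CascadeSVC

x = ds.array(np.random.random((100, 4)), block_size=(25, 4))
y = ds.array(np.random.randint(0, 2, (100, 1)), block_size=(25, 1))

svc = CascadeSVC(max_iter=3)
svc.fit(x, y)
print(svc.predict(x).collect())            # class labels, shape (100, 1)
print(svc.decision_function(x).collect())  # signed margins, shape (100, 1)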