Example #1
    def _mapfill_hprobs_atom(self, array_to_fill, dest_indices,
                             dest_param_indices1, dest_param_indices2,
                             layout_atom, param_indices1, param_indices2,
                             resource_alloc, eps):
        """
        Helper function for populating hessian values by block.
        """
        shared_mem_leader = resource_alloc.is_host_leader if (
            resource_alloc is not None) else True

        if param_indices1 is None:
            param_indices1 = list(range(self.model.num_params))
        if param_indices2 is None:
            param_indices2 = list(range(self.model.num_params))
        if dest_param_indices1 is None:
            dest_param_indices1 = list(range(_slct.length(param_indices1)))
        if dest_param_indices2 is None:
            dest_param_indices2 = list(range(_slct.length(param_indices2)))

        param_indices1 = _slct.to_array(param_indices1)
        dest_param_indices1 = _slct.to_array(dest_param_indices1)
        #dest_param_indices2 = _slct.to_array(dest_param_indices2)  # OK if a slice

        #Get a map from global parameter indices to the desired
        # final index within array_to_fill (fpoffset = final parameter offset)
        iParamToFinal = {
            i: dest_index
            for i, dest_index in zip(param_indices1, dest_param_indices1)
        }

        nEls = layout_atom.num_elements
        nP2 = _slct.length(param_indices2) if isinstance(
            param_indices2, slice) else len(param_indices2)
        dprobs, shm = _smt.create_shared_ndarray(resource_alloc, (nEls, nP2),
                                                 'd')
        dprobs2, shm2 = _smt.create_shared_ndarray(resource_alloc, (nEls, nP2),
                                                   'd')
        self.calclib.mapfill_dprobs_atom(self, dprobs, slice(0, nEls), None,
                                         layout_atom, param_indices2,
                                         resource_alloc, eps)

        orig_vec = self.model.to_vector().copy()
        for i in range(self.model.num_params):
            if i in iParamToFinal:
                iFinal = iParamToFinal[i]
                vec = orig_vec.copy()
                vec[i] += eps
                self.model.from_vector(vec, close=True)
                self.calclib.mapfill_dprobs_atom(self, dprobs2, slice(0, nEls),
                                                 None, layout_atom,
                                                 param_indices2,
                                                 resource_alloc, eps)
                if shared_mem_leader:
                    _fas(array_to_fill,
                         [dest_indices, iFinal, dest_param_indices2],
                         (dprobs2 - dprobs) / eps)
        self.model.from_vector(orig_vec)
        _smt.cleanup_shared_ndarray(shm)
        _smt.cleanup_shared_ndarray(shm2)
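The helper above is a nested forward difference: the Hessian rows for parameter i are the difference of two Jacobians, each itself computed by forward differences. A self-contained sketch of that scheme (fd_jacobian/fd_hessian are illustrative names, not pyGSTi API):

import numpy as np

def fd_jacobian(f, p, eps=1e-7):
    """Forward-difference Jacobian of a vector-valued f at parameter vector p."""
    f0 = f(p)
    jac = np.empty((f0.size, p.size))
    for j in range(p.size):
        pj = p.copy(); pj[j] += eps
        jac[:, j] = (f(pj) - f0) / eps
    return jac

def fd_hessian(f, p, eps=1e-7):
    """Hessian via differences of Jacobians, as in _mapfill_hprobs_atom above."""
    jac0 = fd_jacobian(f, p, eps)
    hess = np.empty((f(p).size, p.size, p.size))
    for i in range(p.size):
        pi = p.copy(); pi[i] += eps
        hess[:, i, :] = (fd_jacobian(f, pi, eps) - jac0) / eps
    return hess

f = lambda p: np.array([p[0] * p[1], p[0] ** 2])  # 2 params -> 2 "probabilities"
print(np.round(fd_hessian(f, np.array([1.0, 2.0]))))
# ~ [[[0, 1], [1, 0]], [[2, 0], [0, 0]]]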
Example #2
 def _bulk_fill_hprobs_atom(self, array_to_fill, dest_param_slice1,
                            dest_param_slice2, layout_atom, param_slice1,
                            param_slice2, resource_alloc):
     # Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller
     resource_alloc.check_can_allocate_memory(
         layout_atom.cache_size * self.model.dim *
         _slct.length(param_slice1) * _slct.length(param_slice2))
     self._mapfill_hprobs_atom(array_to_fill,
                               slice(0, array_to_fill.shape[0]),
                               dest_param_slice1, dest_param_slice2,
                               layout_atom, param_slice1, param_slice2,
                               resource_alloc, self.hessian_eps)
Example #3
    def _success_dprob(self, circuit, param_slice, cache):
        """
        Derivative of the success probability of `circuit` with respect to the model parameters.
        """
        assert(param_slice is None or _slct.length(param_slice) == len(self._paramvec)), \
            "No support for derivatives with respect to a subset of model parameters yet!"
        pvec = self._paramvec**2
        dpvec_dparams = 2 * self._paramvec

        if cache is None:
            cache = self._circuit_cache(circuit)

        one_over_2_width, all_inds_to_mult, all_inds_to_mult_cnt = cache
        sp = 1.0 - pvec

        successprob_all_ops = prod(sp[all_inds_to_mult])
        deriv = _np.zeros(len(pvec), 'd')
        for i, n in enumerate(all_inds_to_mult_cnt):
            deriv[i] = n * successprob_all_ops / sp[i] * -1.0

        # The circuit succeeds if all ops succeed, and has a random outcome otherwise.
        # successprob_circuit = successprob_all_ops + (1 - successprob_all_ops) / 2**width
        # = const + (1-1/2**width)*successprob_all_ops
        deriv *= (1.0 - one_over_2_width)
        return deriv * dpvec_dparams
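A quick finite-difference cross-check of the closed form above; the cache contents here (index list, counts, width) are invented stand-ins for what _circuit_cache would return:

import numpy as np

theta = np.array([0.1, 0.2, 0.3])       # stands in for self._paramvec
all_inds = np.array([0, 1, 1, 2])       # ops appearing in the circuit
cnt = np.array([1, 2, 1])               # occurrence count per parameter index
one_over_2_width = 1.0 / 2**2           # a width-2 circuit

def success_prob(t):
    sp = 1.0 - t**2
    p_all = np.prod(sp[all_inds])
    return p_all + (1 - p_all) * one_over_2_width  # random outcome otherwise

sp = 1.0 - theta**2
p_all = np.prod(sp[all_inds])
deriv = -cnt * p_all / sp * (1.0 - one_over_2_width) * (2 * theta)  # as above

eps = 1e-7
fd = np.array([(success_prob(theta + eps * np.eye(3)[i]) - success_prob(theta)) / eps
               for i in range(3)])
print(np.allclose(deriv, fd, atol=1e-6))  # True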
Example #4
    def _iter_hprobs_by_rectangle(self, layout, wrt_slices_list, return_dprobs_12):
        # Just needed for compatibility - so base `iter_hprobs_by_rectangle` knows to loop over atoms
        # Similar to _iter_atom_hprobs_by_rectangle but runs over all atoms before yielding and
        #  yielded array has leading dim == # of local elements instead of just 1 atom's # elements.
        nElements = layout.num_elements
        resource_alloc = layout.resource_alloc()
        for wrtSlice1, wrtSlice2 in wrt_slices_list:

            if return_dprobs_12:
                dprobs1, dprobs1_shm = _smt.create_shared_ndarray(resource_alloc, (nElements, _slct.length(wrtSlice1)),
                                                                  'd', zero_out=True)
                dprobs2, dprobs2_shm = _smt.create_shared_ndarray(resource_alloc, (nElements, _slct.length(wrtSlice2)),
                                                                  'd', zero_out=True)
            else:
                dprobs1 = dprobs2 = dprobs1_shm = dprobs2_shm = None

            hprobs, hprobs_shm = _smt.create_shared_ndarray(
                resource_alloc, (nElements, _slct.length(wrtSlice1), _slct.length(wrtSlice2)),
                'd', zero_out=True)

            for atom in layout.atoms:
                self._bulk_fill_hprobs_dprobs_atom(hprobs[atom.element_slice, :, :],
                                                   dprobs1[atom.element_slice, :] if (dprobs1 is not None) else None,
                                                   dprobs2[atom.element_slice, :] if (dprobs2 is not None) else None,
                                                   atom, wrtSlice1, wrtSlice2, resource_alloc)
            #Note: we give resource_alloc as our local `resource_alloc` above because all the arrays
            # have been allocated based on just this subset of processors, unlike a call to bulk_fill_hprobs(...)
            # where the probs & dprobs are memory allocated and filled by a larger group of processors.  (the main
            # function of these args is to know which procs work together to fill the *same* values and which of
            # these are on the *same* host so that only one per host actually writes to the assumed-shared memory.)

            if return_dprobs_12:
                dprobs12 = dprobs1[:, :, None] * dprobs2[:, None, :]  # (KM,N,1) * (KM,1,N') = (KM,N,N')
                yield wrtSlice1, wrtSlice2, hprobs, dprobs12
            else:
                yield wrtSlice1, wrtSlice2, hprobs

            _smt.cleanup_shared_ndarray(dprobs1_shm)
            _smt.cleanup_shared_ndarray(dprobs2_shm)
            _smt.cleanup_shared_ndarray(hprobs_shm)
Example #5
def mapfill_dprobs_atom(fwdsim, mx_to_fill, dest_indices, dest_param_indices,
                        layout_atom, param_indices, resource_alloc, eps):

    #eps = 1e-7
    #shared_mem_leader = resource_alloc.is_host_leader if (resource_alloc is not None) else True

    if param_indices is None:
        param_indices = list(range(fwdsim.model.num_params))
    if dest_param_indices is None:
        dest_param_indices = list(range(_slct.length(param_indices)))

    param_indices = _slct.to_array(param_indices)
    dest_param_indices = _slct.to_array(dest_param_indices)

    #Get a map from global parameter indices to the desired
    # final index within mx_to_fill (fpoffset = final parameter offset)
    iParamToFinal = {
        i: dest_index
        for i, dest_index in zip(param_indices, dest_param_indices)
    }

    orig_vec = fwdsim.model.to_vector().copy()
    fwdsim.model.from_vector(
        orig_vec, close=False)  # ensure we call with close=False first

    #Note: no real need for using shared memory here except so that we can pass
    # `resource_alloc` to mapfill_probs_block and have it potentially use multiple procs.
    nEls = layout_atom.num_elements
    probs, shm = _smt.create_shared_ndarray(resource_alloc, (nEls, ),
                                            'd',
                                            memory_tracker=None)
    probs2, shm2 = _smt.create_shared_ndarray(resource_alloc, (nEls, ),
                                              'd',
                                              memory_tracker=None)
    mapfill_probs_atom(fwdsim, probs, slice(0, nEls), layout_atom,
                       resource_alloc)  # probs != shared

    for i in range(fwdsim.model.num_params):
        #print("dprobs cache %d of %d" % (i,self.Np))
        if i in iParamToFinal:
            iFinal = iParamToFinal[i]
            vec = orig_vec.copy()
            vec[i] += eps
            fwdsim.model.from_vector(vec, close=True)
            mapfill_probs_atom(fwdsim, probs2, slice(0, nEls), layout_atom,
                               resource_alloc)
            _fas(mx_to_fill, [dest_indices, iFinal], (probs2 - probs) / eps)
    fwdsim.model.from_vector(orig_vec, close=True)
    _smt.cleanup_shared_ndarray(shm)
    _smt.cleanup_shared_ndarray(shm2)
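The iParamToFinal bookkeeping above maps global parameter indices onto destination columns, so only the requested subset of derivatives is ever computed. A toy illustration with made-up indices:

import numpy as np

num_params = 6
param_indices = np.array([1, 3, 4])                  # global params to differentiate
dest_param_indices = np.arange(len(param_indices))   # output columns 0, 1, 2

iParamToFinal = {i: dest for i, dest in zip(param_indices, dest_param_indices)}
for i in range(num_params):
    if i in iParamToFinal:
        print("perturb global param %d -> fill output column %d" % (i, iParamToFinal[i]))
# perturb global param 1 -> fill output column 0
# perturb global param 3 -> fill output column 1
# perturb global param 4 -> fill output column 2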
Example #6
    def _iter_atom_hprobs_by_rectangle(self, atom, wrt_slices_list, return_dprobs_12, resource_alloc):

        #FUTURE could make a resource_alloc.check_can_allocate_memory call here for ('epp', 'epp')?
        nElements = atom.num_elements
        for wrtSlice1, wrtSlice2 in wrt_slices_list:

            if return_dprobs_12:
                dprobs1, dprobs1_shm = _smt.create_shared_ndarray(resource_alloc, (nElements, _slct.length(wrtSlice1)),
                                                                  'd', zero_out=True)
                dprobs2, dprobs2_shm = _smt.create_shared_ndarray(resource_alloc, (nElements, _slct.length(wrtSlice2)),
                                                                  'd', zero_out=True)
            else:
                dprobs1 = dprobs2 = dprobs1_shm = dprobs2_shm = None

            hprobs, hprobs_shm = _smt.create_shared_ndarray(
                resource_alloc, (nElements, _slct.length(wrtSlice1), _slct.length(wrtSlice2)),
                'd', zero_out=True)

            # Note: no need to index w/ [atom.element_slice,...] (compare with _iter_hprobs_by_rectangle)
            # since these arrays are already sized to this particular atom (not to all the host's atoms)
            self._bulk_fill_hprobs_dprobs_atom(hprobs, dprobs1, dprobs2, atom,
                                               wrtSlice1, wrtSlice2, resource_alloc)
            #Note: we give resource_alloc as our local `resource_alloc` above because all the arrays
            # have been allocated based on just this subset of processors, unlike a call to bulk_fill_hprobs(...)
            # where the probs & dprobs are memory allocated and filled by a larger group of processors.  (the main
            # function of these args is to know which procs work together to fill the *same* values and which of
            # these are on the *same* host so that only one per host actually writes to the assumed-shared memory.)

            if return_dprobs_12:
                dprobs12 = dprobs1[:, :, None] * dprobs2[:, None, :]  # (KM,N,1) * (KM,1,N') = (KM,N,N')
                yield wrtSlice1, wrtSlice2, hprobs, dprobs12
            else:
                yield wrtSlice1, wrtSlice2, hprobs

            _smt.cleanup_shared_ndarray(dprobs1_shm)
            _smt.cleanup_shared_ndarray(dprobs2_shm)
            _smt.cleanup_shared_ndarray(hprobs_shm)
Example #7
    def _success_dprob(self, circuit, param_slice, cache):
        assert(param_slice is None or _slct.length(param_slice) == len(self._paramvec)), \
            "No support for derivatives with respect to a subset of model parameters yet!"
        pvec = self._paramvec**2
        dpvec_dparams = 2 * self._paramvec

        if cache is None:
            cache = self._circuit_cache(circuit)

        all_inds_to_mult, all_inds_to_mult_cnt = cache
        sp = 1.0 - pvec
        successprob_circuit = prod(sp[all_inds_to_mult])
        deriv = _np.zeros(len(pvec), 'd')
        for i, n in enumerate(all_inds_to_mult_cnt):
            deriv[i] = n * successprob_circuit / sp[i] * -1.0

        return deriv * dpvec_dparams
Example #8
    def _success_dprob(self, circuit, param_slice, cache):
        assert(param_slice is None or _slct.length(param_slice) == len(self._paramvec)), \
            "No support for derivatives with respect to a subset of model parameters yet!"
        pvec = self._paramvec**2
        dpvec_dparams = 2 * self._paramvec

        if cache is None:
            cache = self._circuit_cache(circuit)

        # p = product_layers(1 - alpha * (1 - prod_[inds4layer](1 - param))) * \
        #     (prod_[inds4LASTlayer](1 - param) - 1 / 2**width)
        # Note: indices cannot be repeated in a layer, i.e. a given index appears either one or zero times in inds4layer

        width, depth, alpha, one_over_2_width, inds_to_mult_by_layer = cache
        sp = 1.0 - pvec
        deriv = _np.zeros(len(pvec), 'd')

        nLayers = len(inds_to_mult_by_layer)
        lambda_per_layer = _np.empty(nLayers, 'd')
        for i, inds_to_mult in enumerate(inds_to_mult_by_layer[:-1]):
            lambda_per_layer[i] = 1 - alpha * (1 - prod(sp[inds_to_mult]))

        successprob_readout = prod(sp[inds_to_mult_by_layer[-1]])
        lambda_per_layer[nLayers - 1] = successprob_readout - one_over_2_width
        lambda_all_layers = prod(
            lambda_per_layer)  # includes readout factor as last layer

        #All layers except last
        for i, inds_to_mult in enumerate(inds_to_mult_by_layer[:-1]):
            lambda_all_but_current_layer = lambda_all_layers / lambda_per_layer[
                i]
            # for each such ind, when we take deriv wrt this index, we need to differentiate this layer, etc.
            for ind in inds_to_mult:
                deriv[ind] += lambda_all_but_current_layer * alpha * \
                    (prod(sp[inds_to_mult]) / sp[ind]) * -1.0  # what if sp[ind] == 0?

        #Last layer
        lambda_all_but_current_layer = lambda_all_layers / lambda_per_layer[-1]
        for ind in inds_to_mult_by_layer[-1]:
            deriv[ind] += lambda_all_but_current_layer * (
                successprob_readout / sp[ind]) * -1.0  # what if sp[ind] == 0?

        return deriv * dpvec_dparams
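The per-layer depolarizing product above can also be checked numerically; all cache values below (alpha, layer index lists, width factor) are invented for illustration:

import numpy as np

theta = np.array([0.05, 0.1, 0.15])
alpha, one_over_2_width = 4.0 / 3.0, 0.25
inds_by_layer = [np.array([0, 1]), np.array([2])]  # last list = readout layer

def success_metric(t):
    sp = 1.0 - t**2
    lam = [1 - alpha * (1 - np.prod(sp[inds])) for inds in inds_by_layer[:-1]]
    lam.append(np.prod(sp[inds_by_layer[-1]]) - one_over_2_width)
    return np.prod(lam)

sp = 1.0 - theta**2
lam = np.array([1 - alpha * (1 - np.prod(sp[inds])) for inds in inds_by_layer[:-1]]
               + [np.prod(sp[inds_by_layer[-1]]) - one_over_2_width])
lam_all = np.prod(lam)
deriv = np.zeros(3)
for k, inds in enumerate(inds_by_layer[:-1]):       # all layers except readout
    for ind in inds:
        deriv[ind] += (lam_all / lam[k]) * alpha * (np.prod(sp[inds]) / sp[ind]) * -1.0
for ind in inds_by_layer[-1]:                       # readout layer
    deriv[ind] += (lam_all / lam[-1]) * (np.prod(sp[inds_by_layer[-1]]) / sp[ind]) * -1.0
deriv *= 2 * theta                                  # chain rule: pvec = paramvec**2

eps = 1e-7
fd = np.array([(success_metric(theta + eps * np.eye(3)[i]) - success_metric(theta)) / eps
               for i in range(3)])
print(np.allclose(deriv, fd, atol=1e-6))  # True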
Example #9
    def _bulk_fill_dprobs(self, array_to_fill, layout, pr_array_to_fill):
        """Note: we expect that array_to_fill points to the memory specifically for this processor
           (a subset of the memory for the host when memory is shared) """
        blkSize = layout.param_dimension_blk_sizes[0]
        atom_resource_alloc = layout.resource_alloc('atom-processing')
        param_resource_alloc = layout.resource_alloc('param-processing')

        atom_resource_alloc.host_comm_barrier()  # ensure all procs have finished w/shared memory before we reinit
        # Note: use *largest* host comm that we fill - so 'atom' comm, not 'param' comm

        host_param_slice = None  # layout.host_param_slice  # array_to_fill is already just this slice of the host mem
        global_param_slice = layout.global_param_slice

        for atom in layout.atoms:
            #assert(_slct.length(atom.element_slice) == atom.num_elements)  # for debugging
            #print("DEBUG: Atom %d of %d slice=%s" % (iDB, len(layout.atoms), str(atom.element_slice)))

            if pr_array_to_fill is not None:
                self._bulk_fill_probs_atom(pr_array_to_fill[atom.element_slice], atom, atom_resource_alloc)

            if blkSize is None:  # avoid unnecessary slice_up_range and block loop logic in 'else' block
                #Compute all of our derivative columns at once
                self._bulk_fill_dprobs_atom(array_to_fill[atom.element_slice, :], host_param_slice, atom,
                                            global_param_slice, param_resource_alloc)

            else:  # Divide columns into blocks of at most blkSize
                Np = _slct.length(global_param_slice)  # total number of parameters we're computing
                nBlks = int(_np.ceil(Np / blkSize))  # num blocks required to achieve desired average size == blkSize
                blocks = _mpit.slice_up_range(Np, nBlks)  # blocks contain indices into final_array[host_param_slice]

                for block in blocks:
                    host_param_slice_part = block  # _slct.shift(block, host_param_slice.start)  # into host's memory
                    global_param_slice_part = _slct.shift(block, global_param_slice.start)  # actual parameter indices
                    self._bulk_fill_dprobs_atom(array_to_fill[atom.element_slice, :], host_param_slice_part, atom,
                                                global_param_slice_part, param_resource_alloc)

        atom_resource_alloc.host_comm_barrier()  # don't exit until all procs' array_to_fill is ready
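How the column blocks map back to global parameter indices, sketched with a hypothetical stand-in for _mpit.slice_up_range (the real helper lives in pyGSTi's mpitools):

import numpy as np

def slice_up_range(n, num_slices):  # hypothetical reimplementation
    bounds = [int(b) for b in np.linspace(0, n, num_slices + 1)]
    return [slice(a, b) for a, b in zip(bounds[:-1], bounds[1:])]

global_param_slice = slice(10, 25)  # this processor owns global params 10..24
blkSize = 6
Np = global_param_slice.stop - global_param_slice.start
nBlks = int(np.ceil(Np / blkSize))
for block in slice_up_range(Np, nBlks):
    global_part = slice(block.start + global_param_slice.start,   # _slct.shift
                        block.stop + global_param_slice.start)
    print(block, "->", global_part)
# slice(0, 5, None) -> slice(10, 15, None), then (5,10)->(15,20), (10,15)->(20,25)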
Example #10
    def _success_dprob(self, circuit, param_slice, cache):
        """
        Derivative of the success probability of `circuit` with respect to the model parameters.
        """
        assert(param_slice is None or _slct.length(param_slice) == len(self._paramvec)), \
            "No support for derivatives with respect to a subset of model parameters yet!"
        pvec = self._paramvec**2
        dpvec_dparams = 2 * self._paramvec

        if cache is None:
            cache = self._circuit_cache(circuit)

        width, depth, alpha, one_over_2_width, all_inds_to_mult, readout_inds_to_mult, all_inds_to_mult_cnt = cache
        sp = 1.0 - pvec
        lambda_ops = 1.0 - alpha * pvec
        deriv = _np.zeros(len(pvec), 'd')

        # The depolarizing constant for the full sequence of twirled gates.
        lambda_all_layers = prod(lambda_ops[all_inds_to_mult])
        for i, n in enumerate(all_inds_to_mult_cnt):
            deriv[i] = n * lambda_all_layers / lambda_ops[
                i] * -alpha  # -alpha = d(lambda_ops/dparam)

        # The readout success probability.
        readout_deriv = _np.zeros(len(pvec), 'd')
        successprob_readout = prod(sp[readout_inds_to_mult])
        for ind in readout_inds_to_mult:
            readout_deriv[ind] = (successprob_readout /
                                  sp[ind]) * -1.0  # what if sp[ind] == 0?

        # The success probability of the circuit.
        #successprob_circuit = lambda_all_layers * (successprob_readout - one_over_2_width) + one_over_2_width

        # product rule
        return (deriv * (successprob_readout - one_over_2_width) +
                lambda_all_layers * readout_deriv) * dpvec_dparams
Example #11
        def _jacobian_fn(gauge_group_el):

            #Penalty terms below always act on the transformed non-target model.
            original_gauge_group_el = gauge_group_el

            if frobenius_transform_target:
                gauge_group_el = gauge_group_el.inverse()
                mdl_pre = full_target_model.copy()
                mdl_post = mdl_pre.copy()
            else:
                mdl_pre = model.copy()
                mdl_post = mdl_pre.copy()
            mdl_post.transform_inplace(gauge_group_el)

            # Indices: Jacobian output matrix has shape (L, N)
            start = 0
            d = mdl_pre.dim
            N = gauge_group_el.num_params
            L = mdl_pre.num_elements

            #Compute "extra" (i.e. beyond the model-element) rows of jacobian
            if cptp_penalty_factor != 0: L += _cptp_penalty_size(mdl_pre)
            if spam_penalty_factor != 0: L += _spam_penalty_size(mdl_pre)

            #Set basis for penalty term calculation
            if cptp_penalty_factor != 0 or spam_penalty_factor != 0:
                mdl_pre.basis = mxBasis
                mdl_post.basis = mxBasis

            jacMx = _np.zeros((L, N))

            #Overview of terms:
            # objective: op_term = (S_inv * gate * S - target_op)
            # jac:       d(op_term) = (d (S_inv) * gate * S + S_inv * gate * dS )
            #            d(op_term) = (-(S_inv * dS * S_inv) * gate * S + S_inv * gate * dS )

            # objective: rho_term = (S_inv * rho - target_rho)
            # jac:       d(rho_term) = d (S_inv) * rho
            #            d(rho_term) = -(S_inv * dS * S_inv) * rho

            # objective: ET_term = (E.T * S - target_E.T)
            # jac:       d(ET_term) = E.T * dS

            #Overview of terms when frobenius_transform_target == True.  Note that the objective
            #expressions are identical to the above except for an additional overall minus sign and S <=> S_inv.

            # objective: op_term = (gate - S * target_op * S_inv)
            # jac:       d(op_term) = -(dS * target_op * S_inv + S * target_op * -(S_inv * dS * S_inv) )
            #            d(op_term) = (-dS * target_op * S_inv + S * target_op * (S_inv * dS * S_inv) )

            # objective: rho_term = (rho - S * target_rho)
            # jac:       d(rho_term) = - dS * target_rho

            # objective: ET_term = (E.T - target_E.T * S_inv)
            # jac:       d(ET_term) = - target_E.T * -(S_inv * dS * S_inv)
            #            d(ET_term) = target_E.T * (S_inv * dS * S_inv)

            #Distribute computation across processors
            allDerivColSlice = slice(0, N)
            derivSlices, myDerivColSlice, derivOwners, mySubComm = \
                _mpit.distribute_slice(allDerivColSlice, comm)
            if mySubComm is not None:
                _warnings.warn("Note: more CPUs(%d)" % comm.Get_size()
                               + " than gauge-opt derivative columns(%d)!" % N)  # pragma: no cover

            n = _slct.length(myDerivColSlice)
            wrtIndices = _slct.indices(myDerivColSlice) if (n < N) else None
            my_jacMx = jacMx[:, myDerivColSlice]  # just the columns I'm responsible for

            # S, and S_inv are shape (d,d)
            #S       = gauge_group_el.transform_matrix
            S_inv = gauge_group_el.transform_matrix_inverse
            dS = gauge_group_el.deriv_wrt_params(wrtIndices)  # shape (d*d, n)
            dS.shape = (d, d, n)  # call it (d1,d2,n)
            dS = _np.rollaxis(dS, 2)  # shape (n, d1, d2)
            assert(dS.shape == (n, d, d))

            # --- NOTE: ordering here, with running `start` index MUST
            #           correspond to those in Model.residuals, which in turn
            #           must correspond to those in ForwardSimulator.residuals - which
            #           currently orders as: gates, simplified_ops, preps, effects.

            # -- LinearOperator terms
            # -------------------------
            for lbl, G in mdl_pre.operations.items():
                # d(op_term) = S_inv * (-dS * S_inv * G * S + G * dS) = S_inv * (-dS * G' + G * dS)
                #   Note: (S_inv * G * S) is G' (transformed G)
                wt = item_weights.get(lbl, opWeight)
                left = -1 * _np.dot(dS, mdl_post.operations[lbl].to_dense(on_space='minimal'))  # shape (n,d1,d2)
                right = _np.swapaxes(_np.dot(G.to_dense(on_space='minimal'), dS), 0, 1)  # shape (d1,n,d2) -> (n,d1,d2)
                result = _np.swapaxes(_np.dot(S_inv, left + right), 1, 2)  # shape (d1, d2, n)
                result = result.reshape((d**2, n))  # must copy b/c non-contiguous
                my_jacMx[start:start + d**2] = wt * result
                start += d**2

            # -- Instrument terms
            # -------------------------
            for ilbl, Inst in mdl_pre.instruments.items():
                wt = item_weights.get(ilbl, opWeight)
                for lbl, G in Inst.items():
                    # same calculation as for operation terms
                    left = -1 * _np.dot(dS, mdl_post.instruments[ilbl][lbl].to_dense(on_space='minimal'))  # (n,d1,d2)
                    right = _np.swapaxes(_np.dot(G.to_dense(on_space='minimal'), dS), 0, 1)  # (d1,n,d2) -> (n,d1,d2)
                    result = _np.swapaxes(_np.dot(S_inv, left + right), 1, 2)  # shape (d1, d2, n)
                    result = result.reshape((d**2, n))  # must copy b/c non-contiguous
                    my_jacMx[start:start + d**2] = wt * result
                    start += d**2

            # -- prep terms
            # -------------------------
            for lbl, rho in mdl_post.preps.items():
                # d(rho_term) = -(S_inv * dS * S_inv) * rho
                #   Note: (S_inv * rho) is transformed rho
                wt = item_weights.get(lbl, spamWeight)
                Sinv_dS = _np.dot(S_inv, dS)  # shape (d1,n,d2)
                result = -1 * _np.dot(Sinv_dS, rho.to_dense(on_space='minimal'))  # shape (d,n)
                my_jacMx[start:start + d] = wt * result
                start += d

            # -- effect terms
            # -------------------------
            for povmlbl, povm in mdl_pre.povms.items():
                for lbl, E in povm.items():
                    # d(ET_term) = E.T * dS
                    wt = item_weights.get(povmlbl + "_" + lbl, spamWeight)
                    result = _np.dot(E.to_dense(on_space='minimal')[None, :], dS).T  # shape (1,n,d2).T => (d2,n,1)
                    my_jacMx[start:start + d] = wt * result.squeeze(2)  # (d2,n)
                    start += d

            # -- penalty terms  -- Note: still use original gauge transform applied to `model`
            # -------------------------
            if cptp_penalty_factor > 0 or spam_penalty_factor > 0:
                if frobenius_transform_target:  # reset back to non-target-transform "mode"
                    gauge_group_el = original_gauge_group_el
                    mdl_pre = model.copy()
                    mdl_post = mdl_pre.copy()
                    mdl_post.transform_inplace(gauge_group_el)

                if cptp_penalty_factor > 0:
                    start += _cptp_penalty_jac_fill(my_jacMx[start:], mdl_pre, mdl_post,
                                                    gauge_group_el, cptp_penalty_factor,
                                                    mdl_pre.basis, wrtIndices)

                if spam_penalty_factor > 0:
                    start += _spam_penalty_jac_fill(my_jacMx[start:], mdl_pre, mdl_post,
                                                    gauge_group_el, spam_penalty_factor,
                                                    mdl_pre.basis, wrtIndices)

            #At this point, each proc has filled the portions (columns) of jacMx that
            # it's responsible for, and so now we gather them together.
            _mpit.gather_slices(derivSlices, derivOwners, jacMx, [], 1, comm)
            #Note jacMx is completely filled (on all procs)

            if check_jac and (comm is None or comm.Get_rank() == 0):
                def _mock_objective_fn(v):
                    return _objective_fn(gauge_group_el, False)
                vec = gauge_group_el.to_vector()
                _opt.check_jac(_mock_objective_fn, vec, jacMx, tol=1e-5, eps=1e-9, err_type='abs',
                               verbosity=1)

            return jacMx
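The dot/swapaxes sequence used for the operation terms above is equivalent to a per-parameter matrix product S_inv @ (-dS[k] @ G' + G @ dS[k]); a standalone equivalence check on random matrices (no pyGSTi objects involved):

import numpy as np

d, n = 4, 3
rng = np.random.default_rng(0)
S_inv, G, Gprime = rng.normal(size=(3, d, d))           # Gprime plays S_inv * G * S
dS = rng.normal(size=(n, d, d))

# formulation from the jacobian above
left = -1 * np.dot(dS, Gprime)                          # (n,d1,d2)
right = np.swapaxes(np.dot(G, dS), 0, 1)                # (d1,n,d2) -> (n,d1,d2)
result = np.swapaxes(np.dot(S_inv, left + right), 1, 2).reshape(d**2, n)

# same computation with broadcast matmul, one parameter k at a time
X = -np.matmul(dS, Gprime) + np.matmul(G[None, :, :], dS)
alt = np.transpose(np.matmul(S_inv[None, :, :], X), (1, 2, 0)).reshape(d**2, n)
print(np.allclose(result, alt))                         # True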
Example #12
def mpidot(a, b, loc_row_slice, loc_col_slice, slice_tuples_by_rank, comm,
           out=None, out_shm=None):
    """
    Performs a distributed dot product, dot(a,b).

    Parameters
    ----------
    a : numpy.ndarray
        First array to dot together.

    b : numpy.ndarray
        Second array to dot together.

    loc_row_slice, loc_col_slice : slice
        Specify the row or column indices, respectively, of the
        resulting dot product that are computed by this processor (the
        rows of `a` and columns of `b` that are used). Obtained from
        :func:`distribute_for_dot`.

    slice_tuples_by_rank : list
        A list of (row_slice, col_slice) tuples, one per processor within this
        processor's broadcast group, ordered by rank.  Provided by :func:`distribute_for_dot`.

    comm : mpi4py.MPI.Comm or ResourceAllocation or None
        The communicator used to parallelize the dot product.  If a
        :class:`ResourceAllocation` object is given, then a shared
        memory result will be returned when appropriate.

    out : numpy.ndarray, optional
        If not None, the array to use for the result.  This should be the
        same type of array (size, and whether it's shared or not) as this
        function would have created if `out` were `None`.

    out_shm : multiprocessing.shared_memory.SharedMemory, optional
        The shared memory object corresponding to `out` when it uses
        shared memory.

    Returns
    -------
    result : numpy.ndarray
        The resulting array
    shm : multiprocessing.shared_memory.SharedMemory
        A shared memory object needed to cleanup the shared memory.  If
        a normal array is created, this is `None`.  Provide this to
        :func:`cleanup_shared_ndarray` to ensure `result` is deallocated properly.
    """
    # R_ij = sum_k A_ik * B_kj
    from ..baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation
    if isinstance(comm, _ResourceAllocation):
        ralloc = comm
        comm = ralloc.comm
    else:
        ralloc = None

    if comm is None or comm.Get_size() == 1:
        return _np.dot(a, b), None

    if out is None:
        if ralloc is None:
            result, result_shm = _np.zeros((a.shape[0], b.shape[1]), a.dtype), None
        else:
            result, result_shm = _smt.create_shared_ndarray(ralloc, (a.shape[0], b.shape[1]), a.dtype,
                                                            zero_out=True)
    else:
        result = out
        result_shm = out_shm

    rshape = (_slct.length(loc_row_slice), _slct.length(loc_col_slice))
    loc_result_flat = _np.empty(rshape[0] * rshape[1], a.dtype)
    loc_result = loc_result_flat.view(); loc_result.shape = rshape
    loc_result[:, :] = _np.dot(a[loc_row_slice, :], b[:, loc_col_slice])

    # broadcast_comm defines the group of processors this processor communicates with.
    # Without shared memory, this is *all* the other processors.  With shared memory, this
    # is one processor on each host.  This code is identical to that in distribute_for_dot.
    if ralloc is None:
        broadcast_comm = comm
    else:
        broadcast_comm = comm if (ralloc.interhost_comm is None) else ralloc.interhost_comm

    comm.barrier()  # wait for all ranks to do their work (get their loc_result)
    for r, (cur_row_slice, cur_col_slice) in enumerate(slice_tuples_by_rank):
        # for each member of the group that will communicate results
        cur_shape = (_slct.length(cur_row_slice), _slct.length(cur_col_slice))
        buf = loc_result_flat if (broadcast_comm.rank == r) else _np.empty(cur_shape[0] * cur_shape[1], a.dtype)
        broadcast_comm.Bcast(buf, root=r)
        if broadcast_comm.rank != r: buf.shape = cur_shape
        else: buf = loc_result  # already of correct shape
        result[cur_row_slice, cur_col_slice] = buf
    comm.barrier()  # wait for all ranks to finish writing to result

    #assert(_np.linalg.norm(_np.dot(a,b) - result)/(_np.linalg.norm(result) + result.size) < 1e-6),\
    #    "DEBUG: %g, %g, %d" % (_np.linalg.norm(_np.dot(a,b) - result), _np.linalg.norm(result), result.size)
    return result, result_shm
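A minimal usage sketch, assuming mpidot above is in scope: with comm=None (or a one-process communicator) it falls back to numpy.dot and returns shm=None, so no shared-memory cleanup is needed. Under MPI the slice arguments would come from distribute_for_dot.

import numpy as np

a = np.arange(6.0).reshape(2, 3)
b = np.arange(12.0).reshape(3, 4)
result, shm = mpidot(a, b, slice(0, 2), slice(0, 4),
                     [(slice(0, 2), slice(0, 4))], comm=None)
assert np.allclose(result, a @ b) and shm is None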
Example #13
def gather_indices(indices, index_owners, ar_to_fill, ar_to_fill_inds,
                   axes, comm, max_buffer_size=None):
    """
    Gathers data within a numpy array, `ar_to_fill`, according to given indices.

    Upon entry it is assumed that the different processors within `comm` have
    computed different parts of `ar_to_fill`, namely different slices or
    index-arrays of the `axis`-th axis.  At exit, data has been gathered such
    that all processors have the results for the entire `ar_to_fill` (or at least
    for all the indices given).

    Parameters
    ----------
    indices : list
        A list of all the integer-arrays or slices (computed by *any* of
        the processors, not just the current one).  Each element of `indices`
        may be either a single slice/index-array or a tuple of such
        elements (when gathering across multiple dimensions).

    index_owners : dict
        A dictionary mapping the index of an element within `indices` to an
        integer rank of the processor responsible for communicating that
        slice/index-array's data to the rest of the processors.

    ar_to_fill : numpy.ndarray
        The array which contains partial data upon entry and the gathered
        data upon exit.

    ar_to_fill_inds : list
        A list of slice or index-arrays specifying the (fixed) sub-array of
        `ar_to_fill` that should be gathered into.  The elements of
        `ar_to_fill_inds` are taken to be indices for the leading dimension
        first, and any unspecified dimensions or `None` elements are
        assumed to be unrestricted (as if `slice(None,None)`).  Note that
        the combination of `ar_to_fill` and `ar_to_fill_inds` is essentially like
        passing `ar_to_fill[ar_to_fill_inds]` to this function, except it will
        work with index arrays as well as slices.

    axes : int or tuple of ints
        The axis or axes of `ar_to_fill` on which the slices apply (which axis
        do the elements of `indices` refer to?).  Note that `len(axes)` must
        be equal to the number of sub-indices (i.e. the tuple length) of each
        element of `indices`.

    comm : mpi4py.MPI.Comm or None
        The communicator specifying the processors involved and used
        to perform the gather operation.

    max_buffer_size : int or None
        The maximum buffer size in bytes that is allowed to be used
        for gathering data.  If None, there is no limit.

    Returns
    -------
    None
    """
    if comm is None: return  # no gathering needed!

    #Perform broadcasts for each slice in order
    my_rank = comm.Get_rank()
    arIndx = [slice(None, None)] * ar_to_fill.ndim
    arIndx[0:len(ar_to_fill_inds)] = ar_to_fill_inds

    axes = (axes,) if _compat.isint(axes) else axes

    max_indices = [None] * len(axes)
    if max_buffer_size is not None:  # a buffer-size limit was given
        chunkBytes = ar_to_fill.nbytes  # start with the entire array as the "chunk"
        for iaxis, axis in enumerate(axes):
            # Consider restricting the chunk size along the iaxis-th axis.
            #  If we can achieve the desired max_buffer_size by restricting
            #  just along this axis, great.  Otherwise, restrict to at most
            #  1 index along this axis and keep going.
            bytes_per_index = chunkBytes / ar_to_fill.shape[axis]
            max_inds = int(max_buffer_size / bytes_per_index)
            if max_inds == 0:
                max_indices[iaxis] = 1
                chunkBytes /= ar_to_fill.shape[axis]
            else:
                max_indices[iaxis] = max_inds
                break
        else:
            _warnings.warn("gather_indices: Could not achieve max_buffer_size")

    for iIndex, indOrIndTup in enumerate(indices):
        owner = index_owners[iIndex]  # owner's rank
        indTup = (indOrIndTup,) if not isinstance(indOrIndTup, tuple) else indOrIndTup
        assert(len(indTup) == len(axes))

        def to_slice_list(index_array_or_slice):
            """Breaks a slice or index array into a list of slices"""
            if isinstance(index_array_or_slice, slice):
                return [index_array_or_slice]  # easy!

            lst = index_array_or_slice
            if len(lst) == 0: return [slice(0, 0)]

            slc_lst = []
            i = 0; N = len(lst)
            while i < N:
                start = lst[i]
                step = lst[i + 1] - lst[i] if i + 1 < N else None
                while i + 1 < N and lst[i + 1] - lst[i] == step: i += 1
                stop = lst[i] + 1
                slc_lst.append(slice(start, stop, None if step == 1 else step))
                i += 1

            return slc_lst

        #Get a list of the (sub-)indices along each axis, whose product
        # (along the specified axes) gives the entire block given by indTup
        axisSlices = []
        for iaxis, axis in enumerate(axes):
            ind = indTup[iaxis]
            sub_slices = []

            #break `ind`, which may be either a single slice or an index array,
            # into a list of slices that are broadcast one at a time (sometimes
            # these `ind_slice` slices themselves need to be broken up further
            # to obey max_buffer_size).
            for islice in to_slice_list(ind):
                if max_indices[iaxis] is None or max_indices[iaxis] >= _slct.length(islice):
                    sub_slices.append(islice)  # arIndx[axis] = slc
                else:
                    sub_slices.extend(_slct.divide(islice, max_indices[iaxis]))
            axisSlices.append(sub_slices)

        for axSlcs in _itertools.product(*axisSlices):
            #create arIndx from per-axis (sub-)slices and broadcast
            for iaxis, axis in enumerate(axes):
                arIndx[axis] = axSlcs[iaxis]

            #broadcast arIndx slice
            buf = _findx(ar_to_fill, arIndx, True) if (my_rank == owner) \
                else _np.empty(_findx_shape(ar_to_fill, arIndx), ar_to_fill.dtype)
            comm.Bcast(buf, root=owner)
            if my_rank != owner: _fas(ar_to_fill, arIndx, buf)
            buf = None  # free buffer mem asap
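The inner to_slice_list helper run-length-encodes an index array into evenly-stepped slices; the same logic extracted as a standalone demonstration:

def to_slice_list(lst):
    """Break an increasing index list into a minimal list of stepped slices."""
    if len(lst) == 0: return [slice(0, 0)]
    slc_lst, i, N = [], 0, len(lst)
    while i < N:
        start = lst[i]
        step = lst[i + 1] - lst[i] if i + 1 < N else None
        while i + 1 < N and lst[i + 1] - lst[i] == step: i += 1
        stop = lst[i] + 1
        slc_lst.append(slice(start, stop, None if step == 1 else step))
        i += 1
    return slc_lst

print(to_slice_list([0, 1, 2, 5, 7, 9]))
# [slice(0, 3, None), slice(5, 10, 2)]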
Example #14
def gather_slices_by_owner(current_slices, ar_to_fill, ar_to_fill_inds,
                           axes, comm, max_buffer_size=None):
    """
    Gathers data within a numpy array, `ar_to_fill`, according to given slices.

    Upon entry it is assumed that the different processors within `comm` have
    computed different parts of `ar_to_fill`, namely different slices of the
    axes indexed by `axes`. At exit, data has been gathered such that all processors
    have the results for the entire `ar_to_fill` (or at least for all the slices
    given).

    Parameters
    ----------
    current_slices : list
        A list of all the slices computed by the *current* processor.
        Each element of `current_slices` may be either a single slice or a
        tuple of slices (when gathering across multiple dimensions).

    ar_to_fill : numpy.ndarray
        The array which contains partial data upon entry and the gathered
        data upon exit.

    ar_to_fill_inds : list
        A list of slice or index-arrays specifying the (fixed) sub-array of
        `ar_to_fill` that should be gathered into.  The elements of
        `ar_to_fill_inds` are taken to be indices for the leading dimension
        first, and any unspecified dimensions or `None` elements are
        assumed to be unrestricted (as if `slice(None,None)`).  Note that
        the combination of `ar_to_fill` and `ar_to_fill_inds` is essentially like
        passing `ar_to_fill[ar_to_fill_inds]` to this function, except it will
        work with index arrays as well as slices.

    axes : int or tuple of ints
        The axis or axes of `ar_to_fill` on which the slices apply (which axis
        do the slices in `current_slices` refer to?).  Note that `len(axes)` must
        be equal to the number of slices (i.e. the tuple length) of each
        element of `current_slices`.

    comm : mpi4py.MPI.Comm or None
        The communicator specifying the processors involved and used
        to perform the gather operation.

    max_buffer_size : int or None
        The maximum buffer size in bytes that is allowed to be used
        for gathering data.  If None, there is no limit.

    Returns
    -------
    None
    """

    #Note: same beginning as gather_slices (TODO: consolidate?)
    if comm is None: return  # no gathering needed!

    #Perform broadcasts for each slice in order
    my_rank = comm.Get_rank()
    arIndx = [slice(None, None)] * ar_to_fill.ndim
    arIndx[0:len(ar_to_fill_inds)] = ar_to_fill_inds

    axes = (axes,) if _compat.isint(axes) else axes

    max_indices = [None] * len(axes)
    if max_buffer_size is not None:  # a buffer-size limit was given
        chunkBytes = ar_to_fill.nbytes  # start with the entire array as the "chunk"
        for iaxis, axis in enumerate(axes):
            # Consider restricting the chunk size along the iaxis-th axis.
            #  If we can achieve the desired max_buffer_size by restricting
            #  just along this axis, great.  Otherwise, restrict to at most
            #  1 index along this axis and keep going.
            bytes_per_index = chunkBytes / ar_to_fill.shape[axis]
            max_inds = int(max_buffer_size / bytes_per_index)
            if max_inds == 0:
                max_indices[iaxis] = 1
                chunkBytes /= ar_to_fill.shape[axis]
            else:
                max_indices[iaxis] = max_inds
                break
        else:
            _warnings.warn("gather_slices_by_owner: Could not achieve max_buffer_size")
    # -- end part that is the same as gather_slices

    #Get a list of the slices to broadcast, indexed by the rank of the owner proc
    slices_by_owner = comm.allgather(current_slices)
    for owner, slices in enumerate(slices_by_owner):
        for slcOrSlcTup in slices:
            slcTup = (slcOrSlcTup,) if isinstance(slcOrSlcTup, slice) else slcOrSlcTup
            assert(len(slcTup) == len(axes))

            #Get a list of the (sub-)slices along each axis, whose product
            # (along the specified axes) gives the entire block given by slcTup
            axisSlices = []
            for iaxis, axis in enumerate(axes):
                slc = slcTup[iaxis]
                if max_indices[iaxis] is None or max_indices[iaxis] >= _slct.length(slc):
                    axisSlices.append([slc])  # arIndx[axis] = slc
                else:
                    axisSlices.append(_slct.divide(slc, max_indices[iaxis]))

            for axSlcs in _itertools.product(*axisSlices):
                #create arIndx from per-axis (sub-)slices and broadcast
                for iaxis, axis in enumerate(axes):
                    arIndx[axis] = axSlcs[iaxis]

                #broadcast arIndx slice
                buf = _findx(ar_to_fill, arIndx, True) if (my_rank == owner) \
                    else _np.empty(_findx_shape(ar_to_fill, arIndx), ar_to_fill.dtype)
                comm.Bcast(buf, root=owner)
                if my_rank != owner: _fas(ar_to_fill, arIndx, buf)
                buf = None  # free buffer mem asap
Example #15
def gather_slices(slices, slice_owners, ar_to_fill,
                  ar_to_fill_inds, axes, comm, max_buffer_size=None):
    """
    Gathers data within a numpy array, `ar_to_fill`, according to given slices.

    Upon entry it is assumed that the different processors within `comm` have
    computed different parts of `ar_to_fill`, namely different slices of the
    `axis`-th axis.  At exit, data has been gathered such that all processors
    have the results for the entire `ar_to_fill` (or at least for all the slices
    given).

    Parameters
    ----------
    slices : list
        A list of all the slices (computed by *any* of the processors, not
        just the current one).  Each element of `slices` may be either a
        single slice or a tuple of slices (when gathering across multiple
        dimensions).

    slice_owners : dict
        A dictionary mapping the index of a slice (or tuple of slices)
        within `slices` to an integer rank of the processor responsible
        for communicating that slice's data to the rest of the processors.

    ar_to_fill : numpy.ndarray
        The array which contains partial data upon entry and the gathered
        data upon exit.

    ar_to_fill_inds : list
        A list of slice or index-arrays specifying the (fixed) sub-array of
        `ar_to_fill` that should be gathered into.  The elements of
        `ar_to_fill_inds` are taken to be indices for the leading dimension
        first, and any unspecified dimensions or `None` elements are
        assumed to be unrestricted (as if `slice(None,None)`).  Note that
        the combination of `ar_to_fill` and `ar_to_fill_inds` is essentially like
        passing `ar_to_fill[ar_to_fill_inds]` to this function, except it will
        work with index arrays as well as slices.

    axes : int or tuple of ints
        The axis or axes of `ar_to_fill` on which the slices apply (which axis
        do the slices in `slices` refer to?).  Note that `len(axes)` must
        be equal to the number of slices (i.e. the tuple length) of each
        element of `slices`.

    comm : mpi4py.MPI.Comm or ResourceAllocation or None
        The communicator specifying the processors involved and used
        to perform the gather operation.  If a :class:`ResourceAllocation`
        is provided, then inter-host communication is used when available
        to facilitate use of shared intra-host memory.

    max_buffer_size : int or None
        The maximum buffer size in bytes that is allowed to be used
        for gathering data.  If None, there is no limit.

    Returns
    -------
    None
    """
    from ..baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation
    if isinstance(comm, _ResourceAllocation):
        ralloc = comm
        comm = ralloc.comm

        #For use with shared intra-host (intra-node) memory:
        # my_interhost_ranks = ranks of comm, 1 per host, that this processor uses to send/receive data between hosts
        # broadcast_comm = the comm of my_interhost_ranks used to send/receive data.
        if ralloc.interhost_ranks is not None:
            my_interhost_ranks = set(ralloc.interhost_ranks)
            broadcast_rank_map = {comm_rank: broadcast_comm_rank
                                  for broadcast_comm_rank, comm_rank in enumerate(ralloc.interhost_ranks)}
            broadcast_comm = ralloc.interhost_comm
        else:
            my_interhost_ranks = None
            broadcast_rank_map = {i: i for i in range(comm.Get_size())} if (comm is not None) else {0: 0}  # trivial map
            broadcast_comm = comm
    else:
        ralloc = None
        my_interhost_ranks = None
        broadcast_rank_map = {i: i for i in range(comm.Get_size())} if (comm is not None) else {0: 0}  # trivial map
        broadcast_comm = comm

    if comm is None: return  # no gathering needed!

    # To be safe, since use of broadcast_comm below means we don't always need to wait for all procs
    # to finish what they were doing last, which could involve updating a shared ar_to_fill so that
    # values accessed by the already-finished front-running processors are affected!
    comm.barrier()

    #Perform broadcasts for each slice in order
    my_rank = comm.Get_rank()

    axes = (axes,) if _compat.isint(axes) else axes

    #print("DB: Rank %d (%d): BEGIN GATHER SLICES: interhost=%s, group=%s" %
    #      (my_rank, broadcast_comm.rank, str(my_interhost_ranks), str(broadcast_comm.Get_group())))

    # # if ar_to_fill_inds only contains slices (or is empty), then we can slice ar_to_fill once up front
    # # and not use generic arIndx in loop below (slower, especially with lots of procs)
    # if all([isinstance(indx, slice) for indx in ar_to_fill_inds]):
    #     ar_to_fill = ar_to_fill[tuple(ar_to_fill_inds)]  # Note: this *doesn't* reduce its .ndim
    #     ar_to_fill_inds = ()  # now ar_to_fill requires no further indexing

    arIndx = [slice(None, None)] * ar_to_fill.ndim
    arIndx[0:len(ar_to_fill_inds)] = ar_to_fill_inds
    max_indices = [None] * len(axes)
    if max_buffer_size is not None:  # a buffer-size limit was given
        chunkBytes = ar_to_fill.nbytes  # start with the entire array as the "chunk"
        for iaxis, axis in enumerate(axes):
            # Consider restricting the chunk size along the iaxis-th axis.
            #  If we can achieve the desired max_buffer_size by restricting
            #  just along this axis, great.  Otherwise, restrict to at most
            #  1 index along this axis and keep going.
            bytes_per_index = chunkBytes / ar_to_fill.shape[axis]
            max_inds = int(max_buffer_size / bytes_per_index)
            if max_inds == 0:
                max_indices[iaxis] = 1
                chunkBytes /= ar_to_fill.shape[axis]
            else:
                max_indices[iaxis] = max_inds
                break
        else:
            _warnings.warn("gather_slices: Could not achieve max_buffer_size")

    # NOTE: Tried doing something faster (Allgatherv) when slices elements are simple slices (not tuples of slices).
    # This ultimately showed that our repeated use of Bcast isn't any slower than fewer calls to Allgatherv,
    # and since the Allgatherv case complicates the code and ignores the memory limit, it's best to just drop it.

    # Broadcast slices one-by-one (slower, but more general):
    for iSlice, slcOrSlcTup in enumerate(slices):
        owner = slice_owners[iSlice]  # owner's rank
        if my_interhost_ranks is not None and owner not in my_interhost_ranks:
            # if the "source" (owner) of the data isn't a part of my "circle" of ranks, then we
            # don't need to send or receive this data - other ranks on the same hosts will do it.
            continue

        slcTup = (slcOrSlcTup,) if isinstance(slcOrSlcTup, slice) else slcOrSlcTup
        assert(len(slcTup) == len(axes))

        #Get a list of the (sub-)slices along each axis, whose product
        # (along the specified axes) gives the entire block given by slcTup
        axisSlices = []
        for iaxis, axis in enumerate(axes):
            slc = slcTup[iaxis]
            if max_indices[iaxis] is None or max_indices[iaxis] >= _slct.length(slc):
                axisSlices.append([slc])  # arIndx[axis] = slc
            else:
                axisSlices.append(_slct.divide(slc, max_indices[iaxis]))

        for axSlcs in _itertools.product(*axisSlices):
            #create arIndx from per-axis (sub-)slices and broadcast
            for iaxis, axis in enumerate(axes):
                arIndx[axis] = axSlcs[iaxis]

            #broadcast arIndx slice
            buf = _findx(ar_to_fill, arIndx, True) if (my_rank == owner) \
                else _np.empty(_findx_shape(ar_to_fill, arIndx), ar_to_fill.dtype)
            if my_interhost_ranks is None or len(my_interhost_ranks) > 1:
                #print("DB: Rank %d (%d) Broadcast: arIndx = %s, owner=%d root=%d" %
                #      (my_rank, broadcast_comm.rank, str(arIndx), owner, broadcast_rank_map[owner]))
                broadcast_comm.Bcast(buf, root=broadcast_rank_map[owner])
                if my_rank != owner: _fas(ar_to_fill, arIndx, buf)
            buf = None  # free buffer mem asap
    #print("DB: Rank %d: END GATHER SLICES" % my_rank)

    # Important: wait for everything to finish before proceeding
    #  (when broadcast_comm != comm some procs may run ahead - see comment above)
    comm.barrier()
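The max_buffer_size handling shared by the gather_* functions above restricts one axis at a time until a single broadcast buffer fits the budget; extracted here for illustration:

import numpy as np
import warnings

def compute_max_indices(ar_shape, itemsize, axes, max_buffer_size):
    """Per-axis index limits so one broadcast chunk stays under max_buffer_size."""
    max_indices = [None] * len(axes)
    chunk_bytes = int(np.prod(ar_shape)) * itemsize  # whole array as the "chunk"
    for iaxis, axis in enumerate(axes):
        bytes_per_index = chunk_bytes / ar_shape[axis]
        max_inds = int(max_buffer_size / bytes_per_index)
        if max_inds == 0:
            max_indices[iaxis] = 1          # keep 1 index here, restrict next axis too
            chunk_bytes /= ar_shape[axis]
        else:
            max_indices[iaxis] = max_inds   # restricting this axis is enough
            break
    else:
        warnings.warn("Could not achieve max_buffer_size")
    return max_indices

# a (100, 50) float64 array gathered along both axes with an 8 kB budget:
print(compute_max_indices((100, 50), 8, (0, 1), 8000))  # [20, None]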
Example #16
def distribute_slice(s, comm, allow_split_comm=True):
    """
    Partition a contiguous slice evenly among `comm`'s processors.

    This function is similar to :func:`distribute_indices`, but
    is specific to the case when the indices being distributed
    are a consecutive set of integers (specified by a slice).

    Parameters
    ----------
    s : slice
        The slice to be partitioned.

    comm : mpi4py.MPI.Comm or ResourceAllocation
        The communicator which specifies the number of processors and
        which may be split into returned sub-communicators.  If a
        :class:`ResourceAllocation` object, node information is also
        taken into account when available (for shared memory compatibility).

    allow_split_comm : bool
        If True, when there are more processors than slice indices,
        multiple processors will be given the *same* local slice
        and `comm` will be split into sub-communicators, one for each
        group of processors that are given the same local slice.
        If False, then "extra" processors are simply given
        nothing to do, i.e. an empty local slice.

    Returns
    -------
    slices : list of slices
        The list of *unique* slices assigned to different processors.  It's
        possible that a single slice (i.e. element of `slices`) is assigned
        to multiple processors (when there are more processors than indices
        in `s`).
    loc_slice : slice
        A slice specifying the indices belonging to the current processor.
    owners : dict
        A dictionary giving the owning rank of each slice.  Values are integer
        ranks and keys are integers into `slices`, specifying which slice.
    loc_comm : mpi4py.MPI.Comm or ResourceAllocation or None
        The local communicator/ResourceAllocation for the group of processors
        which have been given the same `loc_slice` to compute, obtained by
        splitting `comm`.  If `loc_slice` is unique to the current processor,
        or if `allow_split_comm` is False, None is returned.
    """
    from ..baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation
    if isinstance(comm, _ResourceAllocation):
        ralloc = comm
        comm = ralloc.comm
    else:
        ralloc = None

    if comm is None:
        nprocs, rank = 1, 0
    else:
        nprocs = comm.Get_size()
        rank = comm.Get_rank()

    slices = slice_up_slice(s, min(nprocs, _slct.length(s)))
    assert(len(slices) <= nprocs)
    loc_iSlices, slcOwners, _ = \
        distribute_indices_base(list(range(len(slices))), nprocs, rank,
                                allow_split_comm)
    assert(len(loc_iSlices) <= 1)  # should not assign more than one slice to
    # each proc by design (there are only nprocs slices)

    if len(loc_iSlices) == 1:
        loc_slice = slices[loc_iSlices[0]]

        #Split comm into sub-comms when there are more procs than
        # indices, resulting in all procs getting only a
        # single index and multiple procs getting the *same*
        # (single) index.
        if nprocs > _slct.length(s) and (comm is not None) and allow_split_comm:
            loc_comm = comm.Split(color=loc_iSlices[0], key=rank)
        else:
            loc_comm = None
    else:  # len(loc_iSlices) == 0 (nothing for this proc to do)
        loc_slice = slice(0, 0)
        loc_comm = None

    if ralloc is not None:  # then return a ResourceAllocation instead of a comm
        loc_comm = _ResourceAllocation(loc_comm, ralloc.mem_limit, ralloc.profiler,
                                       ralloc.distribute_method, ralloc.allocated_memory)
        if ralloc.host_comm is not None:
            loc_comm.build_hostcomms()  # signals that we want to use shared intra-host memory

    return slices, loc_slice, slcOwners, loc_comm
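With comm=None the whole slice trivially belongs to the single "processor"; a usage sketch, assuming distribute_slice and its helpers above are in scope:

slices, loc_slice, owners, loc_comm = distribute_slice(slice(0, 10), comm=None)
print(slices, loc_slice, owners, loc_comm)
# expected along the lines of: [slice(0, 10)] slice(0, 10) {0: 0} None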
Example #17
    def _bulk_fill_timedep_deriv(self, layout, dataset, ds_circuits, num_total_outcomes,
                                 deriv_array_to_fill, deriv_fill_fn, array_to_fill=None,
                                 fill_fn=None):
        """
        A helper method for computing (filling) the derivative of a time-dependent quantity.

        A generic method providing the scaffolding used when computing (filling) the
        derivative of a time-dependent quantity.  In particular, it distributes the
        computation among the atoms of `layout` and relies on the caller to supply
        `fill_fn` and `deriv_fill_fn` functions which just need to compute the
        quantity being filled and its derivative given an atom and a parameter slice.

        Parameters
        ----------
        layout : TermCOPALayout
            The layout specifying the quantities (circuit outcome probabilities) to be
            computed, and related information.

        dataset : DataSet
            the data set passed on to the computation functions.

        ds_circuits : list of Circuits
            the circuits to use as they should be queried from `dataset` (see
            below).  This is typically the same list of circuits used to
            construct `layout` potentially with some aliases applied.

        num_total_outcomes : list or array
            a list of the total number of *possible* outcomes for each circuit
            (so `len(num_total_outcomes) == len(ds_circuits)`).  This is
            needed for handling sparse data, where `dataset` may not contain
            counts for all the possible outcomes of each circuit.

        deriv_array_to_fill : numpy ndarray
            an already-allocated E x M numpy array, where E is the total number of
            computed elements (i.e. `layout.num_elements`) and M is the number of
            model parameters.

        deriv_fill_fn : function
            a function used to compute the objective function's Jacobian.

        array_to_fill : numpy array, optional
            when not None, an already-allocated length-E numpy array that is filled
            with the per-circuit contributions computed using `fill_fn` below.

        fill_fn : function, optional
            a function used to compute the objective function.

        Returns
        -------
        None
        """
        #Note: this function is similar to _bulk_fill_dprobs, and the two may be consolidated in the FUTURE.

        blkSize = layout.param_dimension_blk_sizes[0]
        atom_resource_alloc = layout.resource_alloc('atom-processing')
        param_resource_alloc = layout.resource_alloc('param-processing')

        assert(atom_resource_alloc.host_comm is None), \
            "Shared memory is not supported in time-dependent calculations (yet)"

        host_param_slice = layout.host_param_slice
        global_param_slice = layout.global_param_slice

        for atom in layout.atoms:
            elInds = atom.element_slice

            #NOTE: this block uses atom.orig_indices_by_expcircuit, which is specific to _MapCOPALayoutAtom - TODO
            dataset_rows = {i_expanded: dataset[ds_circuits[i]]
                            for i_expanded, i in atom.orig_indices_by_expcircuit.items()}
            num_outcomes = {i_expanded: num_total_outcomes[i]
                            for i_expanded, i in atom.orig_indices_by_expcircuit.items()}

            if array_to_fill is not None:
                fill_fn(array_to_fill, elInds, num_outcomes, atom, dataset_rows, atom_resource_alloc)

            if blkSize is None:  # wrt_filter gives entire computed parameter block
                #Fill derivative cache info
                deriv_fill_fn(deriv_array_to_fill, elInds, host_param_slice, num_outcomes, atom,
                              dataset_rows, global_param_slice, param_resource_alloc)
                #profiler.mem_check("bulk_fill_dprobs: post fill")

            else:  # Divide columns into blocks of at most blkSize
                Np = _slct.length(host_param_slice)  # total number of parameters we're computing
                nBlks = int(_np.ceil(Np / blkSize))
                # num blocks required to achieve desired average size == blkSize
                blocks = _mpit.slice_up_range(Np, nBlks)

                for block in blocks:
                    host_param_slice_part = _slct.shift(block, host_param_slice.start)  # into host's memory
                    global_param_slice_part = _slct.shift(block, global_param_slice.start)  # actual parameter indices
                    deriv_fill_fn(deriv_array_to_fill, elInds, host_param_slice_part, num_outcomes, atom,
                                  dataset_rows, global_param_slice_part, param_resource_alloc)
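
The blocked branch above is easier to see with concrete numbers.  The sketch below (with hypothetical `toy_slice_up_range` / `toy_shift` stand-ins for `_mpit.slice_up_range` and `_slct.shift`, whose exact behavior may differ) shows a 10-parameter slice carved into ceil(10 / 4) = 3 column blocks, each shifted once into host-memory coordinates (where results are written) and once into global-parameter coordinates (which parameters get differentiated).

import math

def toy_slice_up_range(n, num_blocks):
    """Split range(n) into `num_blocks` contiguous slices of near-equal size."""
    base, extra = divmod(n, num_blocks)
    out, start = [], 0
    for k in range(num_blocks):
        stop = start + base + (1 if k < extra else 0)
        out.append(slice(start, stop))
        start = stop
    return out

def toy_shift(s, offset):
    """Shift a slice by a fixed offset."""
    return slice(s.start + offset, s.stop + offset)

host_param_slice = slice(0, 10)     # this processor's columns within host memory
global_param_slice = slice(40, 50)  # the same columns as global parameter indices
blkSize = 4

Np = host_param_slice.stop - host_param_slice.start
nBlks = int(math.ceil(Np / blkSize))  # 3 blocks, of sizes 4, 3, 3
for block in toy_slice_up_range(Np, nBlks):
    host_part = toy_shift(block, host_param_slice.start)      # where to write
    global_part = toy_shift(block, global_param_slice.start)  # what to differentiate
    print(block, '->', host_part, global_part)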
Example #18
    def _bulk_fill_hprobs(self, array_to_fill, layout,
                          pr_array_to_fill, deriv1_array_to_fill, deriv2_array_to_fill):
        """Note: we expect that array_to_fill points to the memory specifically for this processor
           (a subset of the memory for the host when memory is shared) """
        blkSize1 = layout.param_dimension_blk_sizes[0]
        blkSize2 = layout.param_dimension_blk_sizes[1]

        #Assume we're being called with a resource_alloc that's been setup by a distributed layout:
        atom_resource_alloc = layout.resource_alloc('atom-processing')
        param_resource_alloc = layout.resource_alloc('param-processing')
        param2_resource_alloc = layout.resource_alloc('param2-processing')

        atom_resource_alloc.host_comm_barrier()  # ensure all procs have finished w/shared memory before we reinit
        # Note: use *largest* host comm that we fill - so 'atom' comm, not 'param' comm

        host_param_slice = None  # layout.host_param_slice  # array_to_fill is already just this slice of the host mem
        host_param2_slice = None  # layout.host_param2_slice  # array_to_fill is already just this slice of the host mem
        global_param_slice = layout.global_param_slice
        global_param2_slice = layout.global_param2_slice

        for atom in layout.atoms:

            if pr_array_to_fill is not None:
                self._bulk_fill_probs_atom(pr_array_to_fill[atom.element_slice], atom, atom_resource_alloc)

            if blkSize1 is None and blkSize2 is None:  # run 'else' block without unnecessary logic
                #Compute all our derivative columns at once
                if deriv1_array_to_fill is not None:
                    self._bulk_fill_dprobs_atom(deriv1_array_to_fill[atom.element_slice, :], host_param_slice,
                                                atom, global_param_slice, param_resource_alloc)
                if deriv2_array_to_fill is not None:
                    if deriv1_array_to_fill is not None and global_param_slice == global_param2_slice:
                        deriv2_array_to_fill[atom.element_slice, :] = deriv1_array_to_fill[atom.element_slice, :]
                    else:
                        self._bulk_fill_dprobs_atom(deriv2_array_to_fill[atom.element_slice, :], host_param2_slice,
                                                    atom, global_param2_slice, param2_resource_alloc)

                self._bulk_fill_hprobs_atom(array_to_fill[atom.element_slice, :, :], host_param_slice,
                                            host_param2_slice, atom, global_param_slice, global_param2_slice,
                                            param2_resource_alloc)

            else:  # Divide columns into blocks of at most shape (blkSize1, blkSize2)
                assert(blkSize1 is not None and blkSize2 is not None), \
                    "Both (or neither) of the Hessian block sizes must be specified!"
                Np1 = _slct.length(global_param_slice)
                Np2 = _slct.length(global_param2_slice)
                nBlks1 = int(_np.ceil(Np1 / blkSize1))
                nBlks2 = int(_np.ceil(Np2 / blkSize2))
                # num blocks required to achieve desired average size == blkSize1 or blkSize2
                blocks1 = _mpit.slice_up_range(Np1, nBlks1)
                blocks2 = _mpit.slice_up_range(Np2, nBlks2)

                for block1 in blocks1:
                    host_param_slice_part = block1  # _slct.shift(block1, host_param_slice.start)  # into host's memory
                    global_param_slice_part = _slct.shift(block1, global_param_slice.start)  # actual parameter indices

                    if deriv1_array_to_fill is not None:
                        self._bulk_fill_dprobs_atom(deriv1_array_to_fill[atom.element_slice, :], host_param_slice_part,
                                                    atom, global_param_slice_part, param_resource_alloc)

                    for block2 in blocks2:
                        host_param2_slice_part = block2  # into host's memory
                        global_param2_slice_part = _slct.shift(block2, global_param2_slice.start)  # parameter indices
                        self._bulk_fill_hprobs_atom(array_to_fill[atom.element_slice, :, :],
                                                    host_param_slice_part, host_param2_slice_part, atom,
                                                    global_param_slice_part, global_param2_slice_part,
                                                    param2_resource_alloc)

                #Fill deriv2_array_to_fill if we need to.
                if deriv2_array_to_fill is not None:
                    if deriv1_array_to_fill is not None and global_param_slice == global_param2_slice:
                        deriv2_array_to_fill[atom.element_slice, :] = deriv1_array_to_fill[atom.element_slice, :]
                    else:
                        for block2 in blocks2:
                            host_param2_slice_part = block2  # into host's memory
                            global_param2_slice_part = _slct.shift(block2, global_param2_slice.start)  # param indices
                            self._bulk_fill_dprobs_atom(deriv2_array_to_fill[atom.element_slice, :],
                                                        host_param2_slice_part, atom,
                                                        global_param2_slice_part, param_resource_alloc)

        atom_resource_alloc.host_comm_barrier()  # don't exit until all procs' array_to_fill is ready
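
As a toy illustration of the double block loop above (a sketch under assumed shapes, not the library's code): the Hessian is an E x Np1 x Np2 array tiled one (block1, block2) column pair at a time, and when the two global parameter slices coincide the second derivative array can simply copy the first, exactly as in the `deriv2_array_to_fill` branch.

import numpy as np

E, Np1, Np2, blkSize1, blkSize2 = 3, 4, 4, 2, 2

def blocks(n, size):
    """Contiguous column blocks of at most `size` covering range(n)."""
    return [slice(i, min(i + size, n)) for i in range(0, n, size)]

hess = np.zeros((E, Np1, Np2))
for b1 in blocks(Np1, blkSize1):
    for b2 in blocks(Np2, blkSize2):
        hess[:, b1, b2] = 1.0  # stand-in for one _bulk_fill_hprobs_atom tile

deriv1 = np.ones((E, Np1))
same_slices = True  # mirrors global_param_slice == global_param2_slice
deriv2 = deriv1.copy() if same_slices else np.zeros((E, Np2))
assert hess.all()  # every tile was visited exactly once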