Exemplo n.º 1
0
    def batch_compute(self, events, n_jobs=None):
        """Computes the value of the observable on several events.

        **Arguments**

        - **events** : array_like or `fastjet.PseudoJet`
            - The events as an array of arrays of particles in coordinates
            matching those anticipated by `coords`.
        - **n_jobs** : _int_ or `None`
            - The number of worker processes to use. A value of `None` or
            `-1` will use as many processes as there are CPUs on the machine.

        **Returns**

        - _1-d numpy.ndarray_
            - A vector of the observable values for each event.

        **Raises**

        - `ValueError`
            - If `n_jobs` is smaller than one (and is not `-1`).
        """

        # resolve the "use every CPU" sentinels to an actual worker count
        if n_jobs is None or n_jobs == -1:
            n_jobs = multiprocessing.cpu_count() or 1

        # reject zero/negative counts here rather than failing later with an
        # unrelated ZeroDivisionError in the chunksize computation
        if n_jobs < 1:
            raise ValueError('n_jobs must be a positive integer or -1')

        # the resolved worker count is remembered on the instance
        self.n_jobs = n_jobs

        # don't bother setting up a Pool for a single worker
        if n_jobs == 1:
            return np.asarray([self._batch_compute_func(event) for event in events])

        # split the events roughly evenly among the workers, with chunks of
        # at least one and at most 10000 events
        chunksize = min(max(len(events) // n_jobs, 1), 10000)
        with create_pool(n_jobs) as pool:
            values = list(pool.map(self._batch_compute_func, events, chunksize))

        return np.asarray(values)
Exemplo n.º 2
0
    def emds(X0, X1=None, R=1.0, norm=False, gdim=None, n_iter_max=100000,
                          periodic_phi=False, phi_col=2,
                          n_jobs=None, verbose=0, print_every=10**6):
        r"""Compute the EMD between collections of events. This can be used to
        compute EMDs between all pairs of events in a set or between events in
        two different sets.

        **Arguments**

        - **X0** : _list_
            - Iterable collection of events. Each event is assumed to be an 
            `(M,1+gdim)` array of particles, where `M` is the multiplicity and 
            `gdim` is the dimension of the ground space in which to compute
            euclidean distances between particles (specified by the `gdim`
            keyword argument). The zeroth column is assumed to be the energies
            (or equivalently, the transverse momenta) of the particles. For
            typical hadron collider jet applications, each particle will be of
            the form `(pT,y,phi)` where  `y` is the rapidity and `phi` is the
            azimuthal angle.
        - **X1** : _list_ or `None`
            - Iterable collection of events in the same format as `X0`, 
            or `None`. If the latter, the pairwise distances between events
            in `X0` will be computed and the returned matrix will be symmetric.
        - **R** : _float_
            - The R parameter in the EMD definition that controls the relative 
            importance of the two terms. Must be greater than or equal to half 
            of the maximum ground distance in the space in order for the EMD 
            to be a valid metric.
        - **norm** : _bool_
            - Whether or not to normalize the pT values of the events prior to 
            computing the EMD.
        - **gdim** : _int_
            - The dimension of the ground metric space. Useful for restricting
            which dimensions are considered part of the ground space. Can be
            larger than the number of dimensions present in the events (in
            which case all dimensions will be included). If `None`, has no
            effect.
        - **n_iter_max** : _int_
            - Maximum number of iterations for solving the optimal transport 
            problem.
        - **periodic_phi** : _bool_
            - Whether to expect (and therefore properly handle) periodicity
            in the coordinate corresponding to the azimuthal angle $\phi$.
            Should typically be `True` for event-level applications but can
            be set to `False` (which is slightly faster) for jet applications
            where all $\phi$ differences are less than or equal to $\pi$.
        - **phi_col** : _int_
            - The index of the column of $\phi$ values in the event array.
        - **n_jobs** : _int_ or `None`
            - The number of worker processes to use. A value of `None` or `-1`
            will use as many processes as there are CPUs on the machine. Note
            that for smaller numbers of events, a smaller value of `n_jobs`
            can be faster.
        - **verbose** : _int_
            - Controls the verbosity level. A value greater than `0` will print
            the progress of the computation at intervals specified by
            `print_every`.
        - **print_every** : _int_ or _float_
            - The number of computations to do in between printing the
            progress. A float is interpreted as a fraction of the total number
            of pairs. Values that would be smaller than one are treated as
            one. Even if the verbosity level is zero, this still plays a role
            in determining when the worker processes report the results back
            to the main process.

        **Returns**

        - _numpy.ndarray_
            - The EMD values as a two-dimensional array. If `X1` was `None`,
            then the shape will be `(len(X0), len(X0))` and the array will be
            symmetric, otherwise it will have shape `(len(X0), len(X1))`.

        **Raises**

        - `ValueError`
            - If `n_jobs` is smaller than one (and is not `-1`).
        """

        _check_params(norm, gdim, phi_col)

        # resolve and validate the worker count up front so that a bad value
        # fails before any events are processed
        if n_jobs is None or n_jobs == -1:
            n_jobs = multiprocessing.cpu_count() or 1
        if n_jobs < 1:
            raise ValueError('n_jobs must be a positive integer or -1')

        # determine if we're doing symmetric pairs
        sym = X1 is None

        # phi column index shifted down by one for the processed events
        # NOTE(review): presumably because the energy column is split off in
        # `_process_for_emd` - confirm against that helper
        phi_col_m1 = phi_col - 1

        # process events into convenient form for EMD
        X0 = [_process_for_emd(x, norm, gdim, periodic_phi, phi_col_m1) for x in X0]
        X1 = X0 if sym else [_process_for_emd(x, norm, gdim, periodic_phi, phi_col_m1) for x in X1]
        n0, n1 = len(X0), len(X1)

        # lazily enumerate the index pairs: only i < j in the symmetric case
        if sym:
            pairs = itertools.combinations(range(n0), r=2)
            npairs = n0*(n0 - 1)//2
        else:
            pairs = itertools.product(range(n0), range(n1))
            npairs = n0*n1

        # a float print_every is a fraction of the total number of pairs
        if isinstance(print_every, float):
            print_every = int(npairs*print_every)

        # a batch size below one would stall the batching loop below forever
        print_every = max(1, print_every)

        # setup container for EMDs
        emd_matrix = np.zeros((n0, n1))

        start = time.time()

        # run EMDs in this process
        if n_jobs == 1:
            for k, (i, j) in enumerate(pairs, start=1):
                emd_matrix[i, j] = _emd(X0[i], X1[j], R, norm, n_iter_max, periodic_phi, phi_col_m1)

                # k counts the EMDs completed so far
                if verbose >= 1 and k % print_every == 0:
                    args = (k, k/npairs*100, time.time() - start)
                    print('Computed {} EMDs, {:.2f}% done in {:.2f}s'.format(*args))

        # use some number of worker processes to calculate EMDs
        else:

            # verbose printing
            if verbose >= 1:
                print('Using {} worker process{}:'.format(n_jobs, 'es' if n_jobs > 1 else ''))

            # create process pool
            with create_pool(n_jobs) as pool:

                # every task is an [index pair, shared parameters] list
                other_params = [X0, X1, R, norm, n_iter_max, periodic_phi, phi_col_m1]
                imap_args = ([pair, other_params] for pair in pairs)

                # work through the pairs in batches of at most print_every
                begin = end = 0
                while end < npairs:
                    end = min(end + print_every, npairs)
                    chunksize = max(1, (end - begin)//n_jobs)

                    # only hold this many pairs in memory
                    local_imap_args = list(itertools.islice(imap_args, end - begin))

                    # map function and store results
                    results = pool.map(_emd4imap, local_imap_args, chunksize=chunksize)
                    for ((i, j), _), r in zip(local_imap_args, results):
                        emd_matrix[i, j] = r

                    # setup for next iteration of while loop
                    begin = end

                    # print update if verbose
                    if verbose >= 1:
                        args = (end, end/npairs*100, time.time() - start)
                        print('  Computed {} EMDs, {:.2f}% done in {:.2f}s'.format(*args))

        # only entries with i < j were filled in the symmetric case (the
        # diagonal stays zero), so adding the transpose completes the matrix
        if sym:
            emd_matrix += emd_matrix.T

        if verbose >= 1:
            print()

        return emd_matrix
Exemplo n.º 3
0
    def emds_pot(X0, X1=None, R=1.0, norm=False, beta=1.0, measure='euclidean', coords='hadronic',
                             gdim=None, mask=False, n_iter_max=100000, 
                             periodic_phi=False, phi_col=2, empty_policy='error',
                             n_jobs=None, verbose=0, print_every=10**6):
        r"""Compute the EMDs between collections of events. This can be used to
        compute EMDs between all pairs of events in a set or between events in
        two different sets.

        **Arguments**

        - **X0** : _list_
            - Iterable collection of events. Each event is assumed to be an 
            `(M,1+gdim)` array of particles, where `M` is the multiplicity and 
            `gdim` is the dimension of the ground space in which to compute
            euclidean distances between particles (specified by the `gdim`
            keyword argument). The zeroth column is assumed to be the energies
            (or equivalently, the transverse momenta) of the particles. For
            typical hadron collider jet applications, each particle will be of
            the form `(pT,y,phi)` where  `y` is the rapidity and `phi` is the
            azimuthal angle.
        - **X1** : _list_ or `None`
            - Iterable collection of events in the same format as `X0`, 
            or `None`. If the latter, the pairwise distances between events
            in `X0` will be computed and the returned matrix will be symmetric.
        - **R** : _float_
            - The R parameter in the EMD definition that controls the relative 
            importance of the two terms. Must be greater than or equal to half 
            of the maximum ground distance in the space in order for the EMD 
            to be a valid metric satisfying the triangle inequality.
        - **norm** : _bool_
            - Whether or not to normalize the pT values of the events prior to 
            computing the EMD.
        - **beta** : _float_
            - The angular weighting exponent. The internal pairwise distance
            matrix is raised to this power prior to solving the optimal
            transport problem.
        - **measure** : _str_
            - Controls which metric is used to calculate the ground distances
            between particles. `'euclidean'` uses the euclidean metric in
            however many dimensions are provided and specified by `gdim`.
            `'spherical'` uses the opening angle between particles on the
            sphere (note that this is not fully tested and should be used
            cautiously).
        - **coords** : _str_
            - Only has an effect if `measure='spherical'`, in which case it
            controls if `'hadronic'` coordinates `(pT,y,phi,[m])` are expected
            versus `'cartesian'` coordinates `(E,px,py,pz)`.
        - **gdim** : _int_
            - The dimension of the ground metric space. Useful for restricting
            which dimensions are considered part of the ground space. Can be
            larger than the number of dimensions present in the events (in
            which case all dimensions will be included). If `None`, has no
            effect.
        - **mask** : _bool_
            - If `True`, ignores particles farther than `R` away from the
            origin.
        - **n_iter_max** : _int_
            - Maximum number of iterations for solving the optimal transport 
            problem.
        - **periodic_phi** : _bool_
            - Whether to expect (and therefore properly handle) periodicity
            in the coordinate corresponding to the azimuthal angle $\phi$.
            Should typically be `True` for event-level applications but can
            be set to `False` (which is slightly faster) for jet applications
            where all $\phi$ differences are less than or equal to $\pi$.
        - **phi_col** : _int_
            - The index of the column of $\phi$ values in the event array.
        - **empty_policy** : _float_ or `'error'`
            - Controls behavior if an empty event is passed in. When set to
            `'error'`, a `ValueError` is raised if an empty event is
            encountered. If set to a float, that value is returned instead
            on an empty event.
        - **n_jobs** : _int_ or `None`
            - The number of worker processes to use. A value of `None` or `-1`
            will use as many processes as there are CPUs on the machine. Note
            that for smaller numbers of events, a smaller value of `n_jobs`
            can be faster.
        - **verbose** : _int_
            - Controls the verbosity level. A value greater than `0` will print
            the progress of the computation at intervals specified by
            `print_every`.
        - **print_every** : _int_ or _float_
            - The number of computations to do in between printing the
            progress. A float is interpreted as a fraction of the total number
            of pairs. Values that would be smaller than one are treated as
            one. Even if the verbosity level is zero, this still plays a
            role in determining when the worker processes report the results
            back to the main process.

        **Returns**

        - _numpy.ndarray_
            - The EMD values as a two-dimensional array. If `X1` was `None`,
            then the shape will be `(len(X0), len(X0))` and the array will be
            symmetric, otherwise it will have shape `(len(X0), len(X1))`.

        **Raises**

        - `ValueError`
            - If `n_jobs` is smaller than one (and is not `-1`).
        """

        # the processed events live in module-level globals for the duration
        # of the call; the per-pair tasks below carry only indices and scalars
        # NOTE(review): presumably `_emd4map` reads `_X0`/`_X1` inside the
        # worker processes, which is why they are set before the pool is
        # created - confirm against `_emd4map` and `create_pool`
        global _X0, _X1

        _check_params(norm, gdim, phi_col, measure, coords, empty_policy)
        euclidean = (measure == 'euclidean')
        hadr2cart = (not euclidean) and (coords == 'hadronic')
        error_on_empty = (empty_policy == 'error')

        # resolve and validate the worker count up front so that a bad value
        # fails before any events are processed
        if n_jobs is None or n_jobs == -1:
            n_jobs = multiprocessing.cpu_count() or 1
        if n_jobs < 1:
            raise ValueError('n_jobs must be a positive integer or -1')

        # determine if we're doing symmetric pairs
        sym = X1 is None

        # phi column index shifted down by one for the processed events
        # NOTE(review): presumably because the energy column is split off in
        # `_process_for_emd` - confirm against that helper
        phi_col_m1 = phi_col - 1

        # bind the globals before entering the try block so that the cleanup
        # in `finally` always has something to delete
        _X0 = _X1 = None
        try:

            # process events into convenient form for EMD
            start = time.time()
            args = (norm, gdim, periodic_phi, phi_col_m1, 
                    mask, R, hadr2cart, euclidean, error_on_empty)
            _X0 = [_process_for_emd(x, *args) for x in X0]
            _X1 = _X0 if sym else [_process_for_emd(x, *args) for x in X1]
            n0, n1 = len(_X0), len(_X1)

            # begin printing
            if verbose >= 1:
                n = n0 if sym else n0 + n1
                s = 'symmetric' if sym else 'asymmetric'
                t = time.time() - start
                print('Processed {} events for {} EMD computation in {:.3f}s'.format(n, s, t))

            # lazily enumerate the index pairs: only i < j in the symmetric case
            if sym:
                pairs = itertools.combinations(range(n0), r=2)
                npairs = n0*(n0 - 1)//2
            else:
                pairs = itertools.product(range(n0), range(n1))
                npairs = n0*n1

            # a float print_every is a fraction of the total number of pairs
            if isinstance(print_every, float):
                print_every = int(npairs*print_every)

            # a batch size below one would stall the batching loop below forever
            print_every = max(1, print_every)

            # setup container for EMDs
            emd_matrix = np.zeros((n0, n1))

            start = time.time()
            no_norm = not norm

            # run EMDs in this process
            if n_jobs == 1:
                for k, (i, j) in enumerate(pairs, start=1):
                    emd_matrix[i, j] = _emd(_X0[i], _X1[j], R, no_norm, beta, euclidean, 
                                            n_iter_max, periodic_phi, phi_col_m1, empty_policy)

                    # k counts the EMDs completed so far
                    if verbose >= 1 and k % print_every == 0:
                        args = (k, k/npairs*100, time.time() - start)
                        print('  Computed {} EMDs, {:.2f}% done in {:.2f}s'.format(*args))

            # use some number of worker processes to calculate EMDs
            else:

                # verbose printing
                if verbose >= 1:
                    print('Using {} worker process{}:'.format(n_jobs, 'es' if n_jobs > 1 else ''))

                # create process pool
                with create_pool(n_jobs) as pool:

                    # every task is an (index pair, shared parameters) tuple
                    params = (R, no_norm, beta, euclidean, n_iter_max, 
                              periodic_phi, phi_col_m1, empty_policy)
                    map_args = ((pair, params) for pair in pairs)

                    # work through the pairs in batches of at most print_every
                    begin = end = 0
                    while end < npairs:
                        end = min(end + print_every, npairs)

                        # aim for two chunks per worker, rounding up
                        chunksize, extra = divmod(end - begin, n_jobs * 2)
                        if extra:
                            chunksize += 1

                        # only hold this many pairs in memory
                        local_map_args = list(itertools.islice(map_args, end - begin))

                        # map function and store results
                        results = pool.map(_emd4map, local_map_args, chunksize=chunksize)
                        for ((i, j), _), r in zip(local_map_args, results):
                            emd_matrix[i, j] = r

                        # setup for next iteration of while loop
                        begin = end

                        # print update if verbose
                        if verbose >= 1:
                            args = (end, end/npairs*100, time.time() - start)
                            print('  Computed {} EMDs, {:.2f}% done in {:.2f}s'.format(*args))

        finally:

            # delete global arrays, also when the computation failed, so the
            # processed events are not kept alive in module state
            del _X0, _X1

        # only entries with i < j were filled in the symmetric case (the
        # diagonal stays zero), so adding the transpose completes the matrix
        if sym:
            emd_matrix += emd_matrix.T

        return emd_matrix