def batch_compute(self, events, n_jobs=None):
    """Evaluates the observable on a collection of events.

    **Arguments**

    - **events** : array_like or `fastjet.PseudoJet`
        - The events as an array of arrays of particles in coordinates
        matching those anticipated by `coords`.
    - **n_jobs** : _int_ or `None`
        - The number of worker processes to use. A value of `None` will
        use as many processes as there are CPUs on the machine. The
        resolved value is stored on the instance as `n_jobs`.

    **Returns**

    - _1-d numpy.ndarray_
        - A vector of the observable values for each event.
    """

    # no preference given: one worker per CPU (at least one)
    self.n_jobs = (multiprocessing.cpu_count() or 1) if n_jobs is None else n_jobs

    # a single job is evaluated in this process, so no Pool is needed
    if self.n_jobs == 1:
        return np.asarray([self._batch_compute_func(event) for event in events])

    # share the events evenly among the workers, keeping chunks within [1, 10000]
    events_per_job = len(events) // self.n_jobs
    chunksize = min(max(events_per_job, 1), 10000)

    with create_pool(self.n_jobs) as pool:
        values = list(pool.map(self._batch_compute_func, events, chunksize))
        results = np.asarray(values)

    return results
def emds(X0, X1=None, R=1.0, norm=False, gdim=None, n_iter_max=100000,
         periodic_phi=False, phi_col=2, n_jobs=None, verbose=0, print_every=10**6):
    r"""Compute the EMD between collections of events. This can be used to
    compute EMDs between all pairs of events in a set or between events in
    two different sets.

    **Arguments**

    - **X0** : _list_
        - Iterable collection of events. Each event is assumed to be an
        `(M,1+gdim)` array of particles, where `M` is the multiplicity and
        `gdim` is the dimension of the ground space in which to compute
        euclidean distances between particles (specified by the `gdim`
        keyword argument). The zeroth column is assumed to be the energies
        (or equivalently, the transverse momenta) of the particles. For
        typical hadron collider jet applications, each particle will be of
        the form `(pT,y,phi)` where `y` is the rapidity and `phi` is the
        azimuthal angle.
    - **X1** : _list_ or `None`
        - Iterable collection of events in the same format as `X0`, or
        `None`. If the latter, the pairwise distances between events in
        `X0` will be computed and the returned matrix will be symmetric.
    - **R** : _float_
        - The R parameter in the EMD definition that controls the relative
        importance of the two terms. Must be greater than or equal to half
        of the maximum ground distance in the space in order for the EMD
        to be a valid metric.
    - **norm** : _bool_
        - Whether or not to normalize the pT values of the events prior to
        computing the EMD.
    - **gdim** : _int_
        - The dimension of the ground metric space. Useful for restricting
        which dimensions are considered part of the ground space. Can be
        larger than the number of dimensions present in the events (in
        which case all dimensions will be included). If `None`, has no
        effect.
    - **n_iter_max** : _int_
        - Maximum number of iterations for solving the optimal transport
        problem.
    - **periodic_phi** : _bool_
        - Whether to expect (and therefore properly handle) periodicity in
        the coordinate corresponding to the azimuthal angle $\phi$. Should
        typically be `True` for event-level applications but can be set to
        `False` (which is slightly faster) for jet applications where all
        $\phi$ differences are less than or equal to $\pi$.
    - **phi_col** : _int_
        - The index of the column of $\phi$ values in the event array.
    - **n_jobs** : _int_ or `None`
        - The number of worker processes to use. A value of `None` or `-1`
        will use as many processes as there are CPUs on the machine. Note
        that for smaller numbers of events, a smaller value of `n_jobs`
        can be faster.
    - **verbose** : _int_
        - Controls the verbosity level. A value greater than `0` will print
        the progress of the computation at intervals specified by
        `print_every`.
    - **print_every** : _int_ or _float_
        - The number of computations to do in between printing the
        progress. A float is interpreted as a fraction of the total number
        of pairs. Values that resolve to less than one are treated as one.
        Even if the verbosity level is zero, this still plays a role in
        determining when the worker processes report the results back to
        the main process.

    **Returns**

    - _numpy.ndarray_
        - The EMD values as a two-dimensional array. If `X1` was `None`,
        then the shape will be `(len(X0), len(X0))` and the array will be
        symmetric, otherwise it will have shape `(len(X0), len(X1))`.

    **Raises**

    - `ValueError`
        - If `n_jobs` is neither a positive integer, `-1`, nor `None`.
    """

    _check_params(norm, gdim, phi_col)

    # resolve and validate the worker count before any expensive processing
    if n_jobs is None or n_jobs == -1:
        n_jobs = multiprocessing.cpu_count() or 1
    elif n_jobs < 1:
        raise ValueError('n_jobs must be a positive integer or -1')

    # determine if we're doing symmetric pairs
    sym = X1 is None

    # phi column index handed to the helpers (phi_col shifted down by one)
    phi_col_m1 = phi_col - 1

    # process events into convenient form for EMD
    X0 = [_process_for_emd(x, norm, gdim, periodic_phi, phi_col_m1) for x in X0]
    X1 = X0 if sym else [_process_for_emd(x, norm, gdim, periodic_phi, phi_col_m1)
                         for x in X1]
    n0, n1 = len(X0), len(X1)

    # lazily enumerate index pairs; only i < j is computed in the symmetric case
    if sym:
        pairs = itertools.combinations(range(n0), r=2)
        npairs = n0*(n0 - 1)//2
    else:
        pairs = itertools.product(range(n0), range(n1))
        npairs = n0*n1

    # a float print_every is a fraction of the total number of pairs
    if isinstance(print_every, float):
        print_every = int(npairs*print_every)

    # an interval below one would stall the batching loop below
    print_every = max(1, print_every)

    # setup container for EMDs
    emd_vals = np.zeros((n0, n1))

    start = time.time()

    # use some number of worker processes to calculate EMDs
    if n_jobs != 1:

        # verbose printing
        if verbose >= 1:
            print('Using {} worker process{}:'.format(n_jobs, 'es' if n_jobs > 1 else ''))

        # create process pool
        with create_pool(n_jobs) as pool:

            other_params = [X0, X1, R, norm, n_iter_max, periodic_phi, phi_col_m1]
            imap_args = ([pair, other_params] for pair in pairs)

            # iterate over pairs of events in batches of at most print_every
            begin = end = 0
            while end < npairs:
                end = min(end + print_every, npairs)
                chunksize = max(1, (end - begin)//n_jobs)

                # only hold this many pairs in memory
                local_imap_args = list(itertools.islice(imap_args, end - begin))

                # map function and store results
                results = list(pool.map(_emd4imap, local_imap_args, chunksize=chunksize))
                for arg, r in zip(local_imap_args, results):
                    i, j = arg[0]
                    emd_vals[i, j] = r

                # setup for next iteration of while loop
                begin = end

                # print update if verbose
                if verbose >= 1:
                    args = (end, end/npairs*100, time.time() - start)
                    print(' Computed {} EMDs, {:.2f}% done in {:.2f}s'.format(*args))

    # run EMDs in this process
    else:
        # k counts completed computations, so progress reports are exact
        for k, (i, j) in enumerate(pairs, start=1):
            emd_vals[i, j] = _emd(X0[i], X1[j], R, norm, n_iter_max,
                                  periodic_phi, phi_col_m1)
            if verbose >= 1 and (k % print_every) == 0:
                args = (k, k/npairs*100, time.time() - start)
                print('Computed {} EMDs, {:.2f}% done in {:.2f}s'.format(*args))

    # if doing an array with itself, symmetrize the distance matrix
    if sym:
        emd_vals += emd_vals.T

    if verbose >= 1:
        print()

    return emd_vals
def emds_pot(X0, X1=None, R=1.0, norm=False, beta=1.0, measure='euclidean',
             coords='hadronic', gdim=None, mask=False, n_iter_max=100000,
             periodic_phi=False, phi_col=2, empty_policy='error',
             n_jobs=None, verbose=0, print_every=10**6):
    r"""Compute the EMDs between collections of events. This can be used to
    compute EMDs between all pairs of events in a set or between events in
    two different sets.

    **Arguments**

    - **X0** : _list_
        - Iterable collection of events. Each event is assumed to be an
        `(M,1+gdim)` array of particles, where `M` is the multiplicity and
        `gdim` is the dimension of the ground space in which to compute
        euclidean distances between particles (specified by the `gdim`
        keyword argument). The zeroth column is assumed to be the energies
        (or equivalently, the transverse momenta) of the particles. For
        typical hadron collider jet applications, each particle will be of
        the form `(pT,y,phi)` where `y` is the rapidity and `phi` is the
        azimuthal angle.
    - **X1** : _list_ or `None`
        - Iterable collection of events in the same format as `X0`, or
        `None`. If the latter, the pairwise distances between events in
        `X0` will be computed and the returned matrix will be symmetric.
    - **R** : _float_
        - The R parameter in the EMD definition that controls the relative
        importance of the two terms. Must be greater than or equal to half
        of the maximum ground distance in the space in order for the EMD
        to be a valid metric satisfying the triangle inequality.
    - **norm** : _bool_
        - Whether or not to normalize the pT values of the events prior to
        computing the EMD.
    - **beta** : _float_
        - The angular weighting exponent. The internal pairwise distance
        matrix is raised to this power prior to solving the optimal
        transport problem.
    - **measure** : _str_
        - Controls which metric is used to calculate the ground distances
        between particles. `'euclidean'` uses the euclidean metric in
        however many dimensions are provided and specified by `gdim`.
        `'spherical'` uses the opening angle between particles on the
        sphere (note that this is not fully tested and should be used
        cautiously).
    - **coords** : _str_
        - Only has an effect if `measure='spherical'`, in which case it
        controls if `'hadronic'` coordinates `(pT,y,phi,[m])` are expected
        versus `'cartesian'` coordinates `(E,px,py,pz)`.
    - **gdim** : _int_
        - The dimension of the ground metric space. Useful for restricting
        which dimensions are considered part of the ground space. Can be
        larger than the number of dimensions present in the events (in
        which case all dimensions will be included). If `None`, has no
        effect.
    - **mask** : _bool_
        - If `True`, ignores particles farther than `R` away from the
        origin.
    - **n_iter_max** : _int_
        - Maximum number of iterations for solving the optimal transport
        problem.
    - **periodic_phi** : _bool_
        - Whether to expect (and therefore properly handle) periodicity in
        the coordinate corresponding to the azimuthal angle $\phi$. Should
        typically be `True` for event-level applications but can be set to
        `False` (which is slightly faster) for jet applications where all
        $\phi$ differences are less than or equal to $\pi$.
    - **phi_col** : _int_
        - The index of the column of $\phi$ values in the event array.
    - **empty_policy** : _float_ or `'error'`
        - Controls behavior if an empty event is passed in. When set to
        `'error'`, a `ValueError` is raised if an empty event is
        encountered. If set to a float, that value is returned instead on
        an empty event.
    - **n_jobs** : _int_ or `None`
        - The number of worker processes to use. A value of `None` or `-1`
        will use as many processes as there are CPUs on the machine. Note
        that for smaller numbers of events, a smaller value of `n_jobs`
        can be faster.
    - **verbose** : _int_
        - Controls the verbosity level. A value greater than `0` will print
        the progress of the computation at intervals specified by
        `print_every`.
    - **print_every** : _int_ or _float_
        - The number of computations to do in between printing the
        progress. A float is interpreted as a fraction of the total number
        of pairs. Values that resolve to less than one are treated as one.
        Even if the verbosity level is zero, this still plays a role in
        determining when the worker processes report the results back to
        the main process.

    **Returns**

    - _numpy.ndarray_
        - The EMD values as a two-dimensional array. If `X1` was `None`,
        then the shape will be `(len(X0), len(X0))` and the array will be
        symmetric, otherwise it will have shape `(len(X0), len(X1))`.

    **Raises**

    - `ValueError`
        - If `n_jobs` is neither a positive integer, `-1`, nor `None`.
    """

    _check_params(norm, gdim, phi_col, measure, coords, empty_policy)

    # resolve and validate the worker count before touching any global state
    if n_jobs is None or n_jobs == -1:
        n_jobs = multiprocessing.cpu_count() or 1
    elif n_jobs < 1:
        raise ValueError('n_jobs must be a positive integer or -1')

    euclidean = (measure == 'euclidean')
    hadr2cart = (not euclidean) and (coords == 'hadronic')
    error_on_empty = (empty_policy == 'error')

    # determine if we're doing symmetric pairs
    sym = X1 is None

    # phi column index handed to the helpers (phi_col shifted down by one)
    phi_col_m1 = phi_col - 1

    # process events into convenient form for EMD; build them as locals first
    # so that a failure here cannot leave the globals half-assigned
    start = time.time()
    proc_args = (norm, gdim, periodic_phi, phi_col_m1, mask, R,
                 hadr2cart, euclidean, error_on_empty)
    events0 = [_process_for_emd(x, *proc_args) for x in X0]
    events1 = events0 if sym else [_process_for_emd(x, *proc_args) for x in X1]
    n0, n1 = len(events0), len(events1)

    # publish the processed events at module level; this must happen before
    # the pool is created and they are removed again in the `finally` below
    global _X0, _X1
    _X0, _X1 = events0, events1

    try:
        # begin printing
        if verbose >= 1:
            n = n0 if sym else n0 + n1
            s = 'symmetric' if sym else 'asymmetric'
            t = time.time() - start
            print('Processed {} events for {} EMD computation in {:.3f}s'.format(n, s, t))

        # lazily enumerate index pairs; only i < j is computed in the symmetric case
        if sym:
            pairs = itertools.combinations(range(n0), r=2)
            npairs = n0*(n0 - 1)//2
        else:
            pairs = itertools.product(range(n0), range(n1))
            npairs = n0*n1

        # a float print_every is a fraction of the total number of pairs
        if isinstance(print_every, float):
            print_every = int(npairs*print_every)

        # an interval below one would stall the batching loop below
        print_every = max(1, print_every)

        # setup container for EMDs
        emd_vals = np.zeros((n0, n1))

        start = time.time()
        no_norm = not norm

        # use some number of worker processes to calculate EMDs
        if n_jobs != 1:

            # verbose printing
            if verbose >= 1:
                print('Using {} worker process{}:'.format(n_jobs, 'es' if n_jobs > 1 else ''))

            # create process pool
            with create_pool(n_jobs) as pool:

                params = (R, no_norm, beta, euclidean, n_iter_max,
                          periodic_phi, phi_col_m1, empty_policy)
                map_args = ((pair, params) for pair in pairs)

                # iterate over pairs of events in batches of at most print_every
                begin = end = 0
                while end < npairs:
                    end = min(end + print_every, npairs)

                    # ceil((end - begin) / (2*n_jobs)): about two chunks per worker
                    chunksize, extra = divmod(end - begin, n_jobs * 2)
                    if extra:
                        chunksize += 1

                    # only hold this many pairs in memory
                    local_map_args = list(itertools.islice(map_args, end - begin))

                    # map function and store results
                    results = pool.map(_emd4map, local_map_args, chunksize=chunksize)
                    for arg, r in zip(local_map_args, results):
                        i, j = arg[0]
                        emd_vals[i, j] = r

                    # setup for next iteration of while loop
                    begin = end

                    # print update if verbose
                    if verbose >= 1:
                        prog = (end, end/npairs*100, time.time() - start)
                        print(' Computed {} EMDs, {:.2f}% done in {:.2f}s'.format(*prog))

        # run EMDs in this process
        else:
            for k, (i, j) in enumerate(pairs, start=1):
                emd_vals[i, j] = _emd(_X0[i], _X1[j], R, no_norm, beta, euclidean,
                                      n_iter_max, periodic_phi, phi_col_m1, empty_policy)
                if verbose >= 1 and (k % print_every) == 0:
                    prog = (k, k/npairs*100, time.time() - start)
                    print(' Computed {} EMDs, {:.2f}% done in {:.2f}s'.format(*prog))

    finally:
        # delete global arrays, even when the computation raised
        del _X0, _X1

    # if doing an array with itself, symmetrize the distance matrix
    if sym:
        emd_vals += emd_vals.T

    return emd_vals