def sel(self, *args, **kwargs):
    """Build a boolean mask marking the rows of the specs array that satisfy
    all of the selections given in `args`.

    **Arguments**

    - ***args** : arbitrary positional arguments
        - Each selection is either a string or a length-two indexable. A
        string should hold a column header character (an element of `cols`),
        a comparison operator (one of `<`, `>`, `<=`, `>=`, `==`, `!=`), and
        a number; spaces between the parts are stripped before parsing. A
        length-two indexable holds the column-plus-operator string first and
        the value second, and the two are joined as `arg[0] + str(arg[1])`.
        This form is convenient when the value is a variable (such as in a
        list comprehension).
    - **specs** : keyword argument
        - The array whose rows are tested, indexed as `specs[:, column]`.
        Defaults to `self.specs` when omitted or `None`.

    **Returns**

    - _1-d numpy.ndarray_
        - A boolean array with one entry per row of `specs` (by default the
        EFPs stored by this object).

    **Raises**

    - _TypeError_ if a selection is neither a string nor indexable.
    - _ValueError_ if an indexable selection is not of length two, if the
    selection does not match `self._sel_re`, or if its column is not in
    `self.cols`.
    """

    # the only keyword consumed here is 'specs'; whatever remains is handed
    # to kwargs_check for validation
    specs = kwargs.pop('specs', None)
    kwargs_check('sel', kwargs)

    if specs is None:
        specs = self.specs

    # start from all-True and narrow the mask with each selection
    mask = np.ones(len(specs), dtype=bool)
    for arg in args:

        # normalize the selection into a single expression string
        if isinstance(arg, six.string_types):
            expr = arg
        elif not hasattr(arg, '__getitem__'):
            raise TypeError('invalid argument {}'.format(arg))
        elif len(arg) != 2:
            raise ValueError('{} is not length 2'.format(arg))
        else:
            expr = arg[0] + str(arg[1])

        expr = expr.replace(' ', '')

        # split the expression into column, operator and value
        parsed = self._sel_re.match(expr)
        if parsed is None:
            raise ValueError('could not understand \'{}\''.format(arg))

        col = parsed.group(1)
        if col not in self.cols:
            raise ValueError('\'{}\' not in {}'.format(col, self.cols))

        op, number = parsed.group(2, 3)

        # the column index is stored on the object as '<col>_ind'; the value
        # is compared as an integer
        column = specs[:, getattr(self, col + '_ind')]
        mask &= explicit_comp(column, op, int(number))

    return mask
def emd_wasserstein(ev0, ev1, dists=None, R=1.0, beta=1.0, norm=False, gdim=2, mask=False,
                    return_flow=False, do_timing=False,
                    n_iter_max=100000,
                    epsilon_large_factor=10000.0, epsilon_small_factor=1.0,
                    **kwargs):
    r"""Compute the EMD between two events using the Wasserstein library.

    **Arguments**

    - **ev0** : _numpy.ndarray_
        - The first event, given as a two-dimensional array. The event is
        assumed to be an `(M,1+gdim)` array of particles, where `M` is the
        multiplicity and `gdim` is the dimension of the ground space in
        which to compute euclidean distances between particles (as specified
        by the `gdim` keyword argument). The zeroth column is the weights of
        the particles, typically their energies or transverse momenta. For
        typical hadron collider jet applications, each particle will be of
        the form `(pT,y,phi)` where `y` is the rapidity and `phi` is the
        azimuthal angle. If `dists` are provided, then the columns after the
        zeroth are ignored; alternatively a one-dimensional array consisting
        of just the particle weights may be passed in this case.
    - **ev1** : _numpy.ndarray_
        - The other event, same format as `ev0`.
    - **dists** : _numpy.ndarray_
        - A distance matrix between particles in `ev0` and `ev1`. If `None`,
        then the columns of the events after the zeroth are taken to be
        coordinates and the `gdim`-dimensional Euclidean distance is used.
    - **R** : _float_
        - The R parameter in the EMD definition that controls the relative
        importance of the two terms. Must be greater than or equal to half
        of the maximum ground distance in the space in order for the EMD
        to be a valid metric satisfying the triangle inequality.
    - **beta** : _float_
        - The angular weighting exponent. The internal pairwise distance
        matrix is raised to this power prior to solving the optimal
        transport problem.
    - **norm** : _bool_
        - Whether or not to normalize the particle weights to sum to one
        prior to computing the EMD.
    - **gdim** : _int_
        - The dimension of the ground metric space. Useful for restricting
        which dimensions are considered part of the ground space when using
        the internal euclidean distances between particles. Has no effect if
        `dists` are provided.
    - **mask** : _bool_
        - If `True`, masks out particles farther than `R` away from the
        origin. The distance is computed from the coordinate columns only;
        the weight column does not enter. Has no effect if `dists` are
        provided.
    - **return_flow** : _bool_
        - Whether or not to return the flow matrix describing the optimal
        transport found during the computation of the EMD. Note that since
        the second term in Eq. 1 is implemented by including an additional
        particle in the event with lesser total weight, this will be
        reflected in the flow matrix.
    - **do_timing** : _bool_
        - Accepted as part of the signature but not used by this function.
    - **n_iter_max** : _int_
        - Maximum number of iterations for solving the optimal transport
        problem.
    - **epsilon_large_factor** : _float_
        - Controls some tolerances in the optimal transport solver. This
        value is multiplied by the floating points epsilon (around 1e-16 for
        64-bit floats) to determine the actual tolerance.
    - **epsilon_small_factor** : _float_
        - Analogous to `epsilon_large_factor` but used where the numerical
        tolerance can be stricter.

    **Returns**

    - _float_
        - The EMD value.
    - [_numpy.ndarray_], optional
        - The flow matrix found while solving for the EMD. The `(i,j)`th
        entry is the amount of `pT` that flows between particle i in `ev0`
        and particle j in `ev1`. Only returned if `return_flow` is `True`.
    """

    # warn about old kwargs
    old_kwargs = {'measure', 'coords', 'periodic_phi', 'phi_col', 'empty_policy'}
    kwargs_check('emd_wasserstein', kwargs, old_kwargs)
    for k in kwargs:
        warnings.warn("Keyword argument '{}' has no effect on `emd_wasserstein`.".format(k)
                      + " Use `emd_pot` if you need previous functionality.")

    # set options on the shared EMD object
    _EMD.set_R(R)
    _EMD.set_beta(beta)
    _EMD.set_norm(norm)
    _EMD.set_network_simplex_params(n_iter_max, epsilon_large_factor, epsilon_small_factor)

    # run using euclidean distances
    if dists is None:

        # keep the weight column plus the first gdim coordinate columns
        ev0, ev1 = np.atleast_2d(ev0)[:,:gdim+1], np.atleast_2d(ev1)[:,:gdim+1]

        # mask out particles farther than R from the origin; column 0 holds
        # the weights, so only columns 1: contribute to the squared distance
        if mask:
            R2 = R*R
            ev0 = ev0[np.sum(ev0[:,1:]**2, axis=1) <= R2]
            ev1 = ev1[np.sum(ev1[:,1:]**2, axis=1) <= R2]

        # evaluate EMD from (weights, coordinates) of each event
        emd = _EMD(ev0[:,0], ev0[:,1:], ev1[:,0], ev1[:,1:])

    # run using custom distances
    else:

        # if events are 2d, extract weights as just the first column
        if ev0.ndim == 2:
            ev0 = ev0[:,0]
        if ev1.ndim == 2:
            ev1 = ev1[:,0]

        # evaluate EMD
        emd = _EMD(ev0, ev1, dists)

    # fetch the flows only when they were requested
    if return_flow:
        return emd, _EMD.flows()

    return emd
def emds_wasserstein(events0, events1=None, R=1.0, beta=1.0, norm=False, gdim=2, mask=False,
                     external_emd_handler=None,
                     n_jobs=-1, print_every=0, verbose=0,
                     throw_on_error=True, n_iter_max=100000,
                     epsilon_large_factor=10000.0, epsilon_small_factor=1.0,
                     **kwargs):
    r"""Compute the EMDs between collections of events. This can be used to
    compute EMDs between all pairs of events in a set or between events in
    two different sets.

    **Arguments**

    - **events0** : _list_
        - Iterable collection of events. Each event is assumed to be an
        `(M,1+gdim)` array of particles, where `M` is the multiplicity and
        `gdim` is the dimension of the ground space in which to compute
        euclidean distances between particles (as specified by the `gdim`
        keyword argument). The zeroth column is the weights of the
        particles, typically their energies or transverse momenta. For
        typical hadron collider jet applications, each particle will be of
        the form `(pT,y,phi)` where `y` is the rapidity and `phi` is the
        azimuthal angle.
    - **events1** : _list_ or `None`
        - Iterable collection of events in the same format as `events0`,
        or `None`. If the latter, the pairwise distances between events
        in `events0` will be computed and the returned matrix will be
        symmetric.
    - **R** : _float_
        - The R parameter in the EMD definition that controls the relative
        importance of the two terms. Must be greater than or equal to half
        of the maximum ground distance in the space in order for the EMD
        to be a valid metric satisfying the triangle inequality.
    - **beta** : _float_
        - The angular weighting exponent. The internal pairwise distance
        matrix is raised to this power prior to solving the optimal
        transport problem.
    - **norm** : _bool_
        - Whether or not to normalize the particle weights to sum to one
        prior to computing the EMD.
    - **gdim** : _int_
        - The dimension of the ground metric space. Useful for restricting
        which dimensions are considered part of the ground space when using
        the internal euclidean distances between particles.
    - **mask** : _bool_
        - If `True`, ignores particles farther than `R` away from the
        origin.
    - **external_emd_handler** : _wasserstein.ExternalEMDHandler_
        - An instance of an external EMD handler from the wasserstein
        module, e.g. `CorrelationDimension`.
    - **n_jobs** : _int_ or `None`
        - The number of cpu cores to use. A value of `None` or `-1` will use
        as many threads as there are CPUs on the machine, falling back to a
        single thread if the CPU count cannot be determined.
    - **print_every** : _int_
        - The number of computations to do in between printing the
        progress. Even if the verbosity level is zero, this still plays a
        role in determining when the worker threads report the results
        back to the main thread and check for interrupt signals.
    - **verbose** : _int_
        - Controls the verbosity level. A value greater than `0` will print
        the progress of the computation at intervals specified by
        `print_every`.
    - **throw_on_error** : _bool_
        - Whether or not to raise an exception when an issue is encountered.
        Can be useful when debugging.
    - **n_iter_max** : _int_
        - Maximum number of iterations for solving the optimal transport
        problem.
    - **epsilon_large_factor** : _float_
        - Controls some tolerances in the optimal transport solver. This
        value is multiplied by the floating points epsilon (around 1e-16 for
        64-bit floats) to determine the actual tolerance.
    - **epsilon_small_factor** : _float_
        - Analogous to `epsilon_large_factor` but used where the numerical
        tolerance can be stricter.

    **Returns**

    - _numpy.ndarray_
        - The EMD values as a two-dimensional array, except if an external
        EMD handler was provided, in which case no value is returned. If
        `events1` was `None`, then the shape will be
        `(len(events0), len(events0))` and the array will be symmetric,
        otherwise it will have shape `(len(events0), len(events1))`.
    """

    # warn about old kwargs
    old_kwargs = {'X0', 'X1', 'measure', 'coords', 'periodic_phi', 'phi_col', 'empty_policy'}
    kwargs_check('emds_wasserstein', kwargs, old_kwargs)
    for k in kwargs:
        warnings.warn("Keyword argument '{}' has no effect on `emds_wasserstein`.".format(k)
                      + " Use `emds_pot` if you need previous functionality.")

    # determine number of threads to use
    if n_jobs is None or n_jobs == -1:
        try:
            n_jobs = multiprocessing.cpu_count() or 1
        except NotImplementedError:
            # cpu_count reports an undeterminable count by raising rather
            # than returning a falsy value, so handle that case explicitly
            n_jobs = 1

    # create object
    pairwise_emd = wasserstein.PairwiseEMD(R, beta, norm, n_jobs, print_every, bool(verbose),
                                           throw_on_error=throw_on_error,
                                           n_iter_max=n_iter_max,
                                           epsilon_large_factor=epsilon_large_factor,
                                           epsilon_small_factor=epsilon_small_factor)
    if verbose > 0:
        print(pairwise_emd)

    # set handler if given
    if external_emd_handler is not None:
        pairwise_emd.set_external_emd_handler(external_emd_handler)

    # run computation
    pairwise_emd(events0, events1, gdim, mask)

    # return the EMD matrix only when no external handler was provided
    if external_emd_handler is None:
        return pairwise_emd.emds()

    return None
def __init__(self, kwargs):
    """Validate the measure options and build the `Measure` for this object.

    **Arguments**

    - **kwargs** : _dict_
        - Options for the measure, passed as a single mapping rather than
        as `**kwargs`. It is checked by `kwargs_check` against
        `MEASURE_KWARGS`. The `'measure'` entry is removed from it (the
        caller's mapping is modified in place) and given to `Measure` as
        the first positional argument; the remaining entries are forwarded
        as keyword arguments.
    """

    kwargs_check('EFBase', kwargs, allowed=MEASURE_KWARGS)

    # pop with no default: presumably raises KeyError for a plain dict that
    # lacks 'measure' -- callers are expected to always supply it
    measure_name = kwargs.pop('measure')
    self._measure = Measure(measure_name, **kwargs)