def replace_with_closer(st, mobile_atom, before_site, start_frame,
                        after_site, end_frame, pbcc=None):
    if before_site == SiteTrajectory.SITE_UNKNOWN or \
       after_site == SiteTrajectory.SITE_UNKNOWN:
        return SiteTrajectory.SITE_UNKNOWN
    if pbcc is None:
        pbcc = PBCCalculator(st.site_network.structure.cell)
    n_frames = end_frame - start_frame
    out = np.empty(shape=n_frames, dtype=np.int)
    # Scratch buffers: the two candidate site centers, and their distances to
    # the mobile atom's instantaneous position.
    ptbuf = np.empty(shape=(2, 3), dtype=st.site_network.centers.dtype)
    distbuf = np.empty(shape=2, dtype=np.float)
    for i in range(n_frames):
        # `distances(..., in_place=True)` may modify the point buffer, so
        # refill it every iteration.
        ptbuf[0] = st.site_network.centers[before_site]
        ptbuf[1] = st.site_network.centers[after_site]
        pbcc.distances(st.real_trajectory[start_frame + i, mobile_atom],
                       ptbuf, in_place=True, out=distbuf)
        if distbuf[0] < distbuf[1]:
            out[i] = before_site
        else:
            out[i] = after_site
    return out
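
# Minimal self-contained sketch of the same "assign each frame to the nearer
# of two site centers" idea, without periodic boundary conditions.
# Illustrative only; `_example_` names are not part of the library.
def _example_nearer_of_two(positions, center_a, center_b, label_a, label_b):
    """positions: (n_frames, 3) array; returns (n_frames,) integer labels."""
    import numpy as np
    d_a = np.linalg.norm(positions - center_a, axis=1)
    d_b = np.linalg.norm(positions - center_b, axis=1)
    return np.where(d_a < d_b, label_a, label_b)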
def _get_sites_to_merge(self, st):
    # -- Compute jump statistics
    if not st.site_network.has_attribute('n_ij'):
        ja = JumpAnalysis()
        ja.run(st)

    pbcc = PBCCalculator(st.site_network.structure.cell)
    site_centers = st.site_network.centers

    # -- Build connectivity_matrix
    connectivity_matrix = self.connectivity_matrix_generator(
        st.site_network).copy()
    n_sites_before = st.site_network.n_sites
    assert n_sites_before == connectivity_matrix.shape[0]
    centers_before = st.site_network.centers

    # For diagnostic purposes
    no_diag_graph = connectivity_matrix.astype(dtype=np.float, copy=True)
    np.fill_diagonal(no_diag_graph, np.nan)
    # Rather arbitrary, but this is really just an alarm for if things
    # are really, really wrong
    edge_threshold = np.nanmean(no_diag_graph) + 3 * np.nanstd(no_diag_graph)
    n_alarming_ignored_edges = 0

    # Apply distance threshold
    for i in range(n_sites_before):
        dists = pbcc.distances(centers_before[i], centers_before[i + 1:])
        js_too_far = np.where(dists > self.distance_threshold)[0]
        js_too_far += i + 1
        if np.any(connectivity_matrix[i, js_too_far] > edge_threshold) or \
           np.any(connectivity_matrix[js_too_far, i] > edge_threshold):
            n_alarming_ignored_edges += 1
        connectivity_matrix[i, js_too_far] = 0
        connectivity_matrix[js_too_far, i] = 0  # Symmetry

    if n_alarming_ignored_edges > 0:
        logger.warning(
            " At least %i site pairs with high (z-score > 3) fluxes were over the given distance cutoff.\n"
            " This may or may not be a problem; but if `distance_threshold` is low, consider raising it."
            % n_alarming_ignored_edges)

    # -- Do Markov Clustering
    clusters = markov_clustering(connectivity_matrix,
                                 **self.markov_parameters)

    return clusters
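
# Illustrative sketch (toy numbers, assumed return format) of what
# `markov_clustering` consumes and produces here: an n_sites x n_sites matrix
# of inter-site jump fluxes, with over-distance edges zeroed out, clustered
# into groups of strongly-communicating sites. For a 4-site network where
# sites 0/1 and 2/3 exchange heavily with each other but not across:
#
#     connectivity = np.array([[10., 40.,  0.,  0.],
#                              [35., 10.,  0.,  0.],
#                              [ 0.,  0., 12., 50.],
#                              [ 0.,  0., 45., 12.]])
#     clusters = markov_clustering(connectivity)
#     # expected grouping: sites {0, 1} and sites {2, 3}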
def _get_sites_to_merge(self, st, threshold=0):
    sn = st.site_network
    attrmat = getattr(sn, self.attrname)
    assert attrmat.shape == (
        sn.n_sites, sn.n_sites
    ), "`attrname` doesn't seem to indicate an edge property."
    connmat = self.relation(attrmat, threshold)

    # Apply distance threshold
    if self.distance_threshold < np.inf:
        pbcc = PBCCalculator(sn.structure.cell)
        centers = sn.centers
        for i in range(sn.n_sites):
            dists = pbcc.distances(centers[i], centers[i + 1:])
            js_too_far = np.where(dists > self.distance_threshold)[0]
            js_too_far += i + 1
            connmat[i, js_too_far] = False
            connmat[js_too_far, i] = False  # Symmetry

    if self.forbid_multiple_occupancy:
        for frame in st.traj:
            frame = [s for s in frame if s >= 0]  # only known sites
            for site in frame:
                # An occupied site can't be merged with the other
                # simultaneously occupied sites.
                connmat[site, frame] = False

    # Everything is always mergeable with itself.
    np.fill_diagonal(connmat, True)

    # Get mergeable groups
    n_merged_sites, labels = connected_components(
        connmat, directed=self.directed, connection=self.connection)

    # MergeSites will check pairwise distances; we just need to put the
    # groups into the right format.
    merge_groups = []
    for lbl in range(n_merged_sites):
        merge_groups.append(np.where(labels == lbl)[0])

    return merge_groups
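
# Self-contained sketch of the grouping step above: `connected_components`
# turns a boolean "mergeable" relation into per-site group labels, which are
# then inverted into arrays of site indices. (Toy data; `_example_` names are
# not part of the library.)
def _example_merge_groups():
    import numpy as np
    from scipy.sparse.csgraph import connected_components
    connmat = np.array([[True,  True,  False],
                        [True,  True,  False],
                        [False, False, True]])
    n_groups, labels = connected_components(connmat, directed=False)
    return [np.where(labels == lbl)[0] for lbl in range(n_groups)]
    # returns [array([0, 1]), array([2])]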
class LandmarkAnalysis(object):
    """Track a mobile species through a fixed lattice using landmark vectors."""

    def __init__(self,
                 clustering_algorithm='dotprod',
                 clustering_params={},
                 cutoff=2.0,
                 minimum_site_occupancy=0.1,
                 peak_evening='none',
                 weighted_site_positions=True,
                 check_for_zero_landmarks=True,
                 static_movement_threshold=1.0,
                 dynamic_lattice_mapping=False,
                 relaxed_lattice_checks=False,
                 max_mobile_per_site=1,
                 force_no_memmap=False,
                 verbose=True):
        """
        :param double cutoff: The distance cutoff for the landmark vectors.
            (unitless)
        :param double minimum_site_occupancy = 0.1: Minimum occupancy (% of
            time occupied) for a site to qualify as such.
        :param dict clustering_params: Parameters for the chosen
            clustering_algorithm.
        :param str peak_evening: Whether and what kind of peak "evening" to
            apply; that is, processing that makes all large peaks in the
            landmark vector more similar in magnitude. This can help in site
            clustering. Valid options: 'none', 'clip'
        :param bool weighted_site_positions: When computing site positions,
            whether to weight the average by assignment confidence.
        :param bool check_for_zero_landmarks: Whether to check for and raise
            exceptions when all-zero landmark vectors are computed.
        :param float static_movement_threshold: (Angstrom) the maximum allowed
            distance between an instantaneous static atom position and its
            ideal position.
        :param bool dynamic_lattice_mapping: Whether to dynamically decide, at
            each frame, which static atom represents each average lattice
            position; this allows the LandmarkAnalysis to deal with, say, a
            rare exchange of two static atoms that does not change the
            structure of the lattice.

            It does NOT allow LandmarkAnalysis to deal with lattices whose
            structures actually change over the course of the trajectory. In
            certain cases this is better dealt with by MergeSitesByDynamics.
        :param int max_mobile_per_site: The maximum number of mobile atoms that
            can be assigned to a single site without throwing an error.
            Regardless of the value, assignments of more than one mobile atom
            to a single site will be recorded and reported.

            Setting this to 2 can be necessary for very diffusive, liquid-like
            materials at high temperatures.

            Statistics related to this are reported in
            self.avg_mobile_per_site and self.n_multiple_assignments.
        :param bool force_no_memmap: If True, landmark vectors will be stored
            only in memory. Only useful if access to landmark vectors after
            the analysis has run is desired.
        :param bool verbose: If `True`, progress bars and messages will be
            printed to stdout.
        """
        self._cutoff = cutoff
        self._minimum_site_occupancy = minimum_site_occupancy

        self._cluster_algo = clustering_algorithm
        self._clustering_params = clustering_params

        if peak_evening not in ['none', 'clip']:
            raise ValueError("Invalid value `%s` for peak_evening" % peak_evening)
        self._peak_evening = peak_evening

        self.verbose = verbose
        self.check_for_zero_landmarks = check_for_zero_landmarks
        self.weighted_site_positions = weighted_site_positions
        self.dynamic_lattice_mapping = dynamic_lattice_mapping
        self.relaxed_lattice_checks = relaxed_lattice_checks

        self._landmark_vectors = None
        self._landmark_dimension = None

        self.static_movement_threshold = static_movement_threshold
        self.max_mobile_per_site = max_mobile_per_site
        self.force_no_memmap = force_no_memmap

        self._has_run = False

    @property
    def cutoff(self):
        return self._cutoff

    @analysis_result
    def landmark_vectors(self):
        view = self._landmark_vectors[:]
        view.flags.writeable = False
        return view

    @analysis_result
    def landmark_dimension(self):
        return self._landmark_dimension

    def run(self, sn, frames):
        """Run the landmark analysis.

        The input SiteNetwork is a network of predicted sites; its sites will
        be used as the "basis" for the landmark vectors.

        Takes a SiteNetwork and returns a SiteTrajectory.
        """
        assert isinstance(sn, SiteNetwork)

        if self._has_run:
            raise ValueError("Cannot rerun LandmarkAnalysis!")

        if frames.shape[1:] != (sn.n_total, 3):
            raise ValueError("Wrong shape %s for frames." % (frames.shape, ))

        if sn.vertices is None:
            raise ValueError("Input SiteNetwork must have vertices")

        n_frames = len(frames)

        if self.verbose:
            print("--- Running Landmark Analysis ---")

        # Create PBCCalculator
        self._pbcc = PBCCalculator(sn.structure.cell)

        # -- Step 1: Compute site-to-vertex distances
        self._landmark_dimension = sn.n_sites

        longest_vert_set = np.max([len(v) for v in sn.vertices])
        verts_np = np.array(
            [v + [-1] * (longest_vert_set - len(v)) for v in sn.vertices])
        site_vert_dists = np.empty(shape=verts_np.shape, dtype=np.float)
        site_vert_dists.fill(np.nan)

        for i, polyhedron in enumerate(sn.vertices):
            verts_poses = sn.static_structure.get_positions()[polyhedron]
            dists = self._pbcc.distances(sn.centers[i], verts_poses)
            site_vert_dists[i, :len(polyhedron)] = dists

        # -- Step 2: Compute landmark vectors
        if self.verbose:
            print(" - computing landmark vectors -")
        # The dimension of one landmark vector is the number of Voronoi regions
        shape = (n_frames * sn.n_mobile, self._landmark_dimension)

        with tempfile.NamedTemporaryFile() as mmap_backing:
            if self.force_no_memmap:
                self._landmark_vectors = np.empty(shape=shape, dtype=np.float)
            else:
                self._landmark_vectors = np.memmap(mmap_backing.name,
                                                   mode='w+',
                                                   dtype=np.float,
                                                   shape=shape)

            helpers._fill_landmark_vectors(
                self, sn, verts_np, site_vert_dists, frames,
                check_for_zeros=self.check_for_zero_landmarks,
                tqdm=tqdm)

            # -- Step 3: Cluster landmark vectors
            if self.verbose:
                print(" - clustering landmark vectors -")

            # - Preprocess -
            self._do_peak_evening()

            # - Cluster -
            cluster_func = importlib.import_module(
                "..cluster." + self._cluster_algo,
                package=__name__).do_landmark_clustering

            cluster_counts, lmk_lbls, lmk_confs = \
                cluster_func(self._landmark_vectors,
                             clustering_params=self._clustering_params,
                             min_samples=self._minimum_site_occupancy / float(sn.n_mobile),
                             verbose=self.verbose)

        if self.verbose:
            print(" Failed to assign %i%% of mobile particle positions to sites."
                  % (100.0 * np.sum(lmk_lbls < 0) / float(len(lmk_lbls))))

        # reshape labels and confidences
        lmk_lbls.shape = (n_frames, sn.n_mobile)
        lmk_confs.shape = (n_frames, sn.n_mobile)

        n_sites = len(cluster_counts)
        if n_sites < sn.n_mobile:
            raise ValueError(
                "There are %i mobile particles, but only identified %i sites. Check clustering_params."
                % (sn.n_mobile, n_sites))

        if self.verbose:
            print(" Identified %i sites with assignment counts %s"
                  % (n_sites, cluster_counts))

        # Check that multiple particles are never assigned to one site at the
        # same time, because that would be wrong.
        n_more_than_ones = 0
        avg_mobile_per_site = 0
        divisor = 0
        for frame_i, site_frame in enumerate(lmk_lbls):
            _, counts = np.unique(site_frame[site_frame >= 0],
                                  return_counts=True)
            count_msk = counts > self.max_mobile_per_site
            if np.any(count_msk):
                raise ValueError(
                    "%i mobile particles were assigned to only %i site(s) (%s) at frame %i."
                    % (np.sum(counts[count_msk]), np.sum(count_msk),
                       np.where(count_msk)[0], frame_i))
            n_more_than_ones += np.sum(counts > 1)
            avg_mobile_per_site += np.sum(counts)
            divisor += len(counts)

        self.n_multiple_assignments = n_more_than_ones
        self.avg_mobile_per_site = avg_mobile_per_site / float(divisor)

        # -- Do output
        # - Compute site centers
        site_centers = np.empty(shape=(n_sites, 3), dtype=frames.dtype)
        for site in range(n_sites):
            mask = lmk_lbls == site
            pts = frames[:, sn.mobile_mask][mask]
            if self.weighted_site_positions:
                site_centers[site] = self._pbcc.average(
                    pts, weights=lmk_confs[mask])
            else:
                site_centers[site] = self._pbcc.average(pts)

        # Build output objects
        out_sn = sn.copy()
        out_sn.centers = site_centers
        assert out_sn.vertices is None

        out_st = SiteTrajectory(out_sn, lmk_lbls, lmk_confs)
        out_st.set_real_traj(frames)
        self._has_run = True

        return out_st

    # -------- "private" methods --------

    def _do_peak_evening(self):
        if self._peak_evening == 'none':
            return
        elif self._peak_evening == 'clip':
            lvec_peaks = np.max(self._landmark_vectors, axis=1)
            # Clip all peaks to the lowest "normal" (stdev.) peak
            lvec_clip = np.mean(lvec_peaks) - np.std(lvec_peaks)
            # Do the clipping
            self._landmark_vectors[
                self._landmark_vectors > lvec_clip] = lvec_clip
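
# Typical usage of this class (a sketch; assumes `sn` is a SiteNetwork with
# vertices, e.g. from a Voronoi decomposition, and `frames` is an
# (n_frames, n_atoms, 3) position array):
#
#     la = LandmarkAnalysis(cutoff=2.0, minimum_site_occupancy=0.1)
#     st = la.run(sn, frames)         # SiteNetwork -> SiteTrajectory
#     lvecs = la.landmark_vectors     # read-only view, valid after run()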
class LandmarkAnalysis(object):
    """Site analysis of mobile atoms in a static lattice with landmark analysis.

    :param double cutoff_midpoint: The midpoint for the logistic function used
        as the landmark cutoff function. (unitless)
    :param double cutoff_steepness: Steepness of the logistic cutoff function.
    :param double minimum_site_occupancy = 0.01: Minimum occupancy (% of time
        occupied) for a site to qualify as such.
    :param str clustering_algorithm: The landmark clustering algorithm.
        ``sitator`` supplies two:

        - ``"dotprod"``: The method described in our "Unsupervised landmark
          analysis for jump detection in molecular dynamics simulations" paper.
        - ``"mcl"``: A newer method we are developing.
    :param dict clustering_params: Parameters for the chosen
        ``clustering_algorithm``.
    :param str site_centers_method: The method to use for computing the real
        space positions of the sites. Options:

        - ``SITE_CENTERS_REAL_UNWEIGHTED``: A spatial average of all real-space
          mobile atom positions assigned to the site is taken.
        - ``SITE_CENTERS_REAL_WEIGHTED``: A spatial average of all real-space
          mobile atom positions assigned to the site is taken, weighted by the
          confidences with which they were assigned to the site.
        - ``SITE_CENTERS_REPRESENTATIVE_LANDMARK``: A spatial average over all
          landmarks' centers is taken, weighted by the representative or
          "typical" landmark vector at the site.

        The "real" methods will generally be more faithful to the simulation,
        but the representative landmark method can work better in cases with
        short trajectories, producing a more "ideal" site location.
    :param bool check_for_zero_landmarks: Whether to check for and raise
        exceptions when all-zero landmark vectors are computed.
    :param float static_movement_threshold: (Angstrom) the maximum allowed
        distance between an instantaneous static atom position and its ideal
        position.
    :param bool dynamic_lattice_mapping: Whether to dynamically decide, at each
        frame, which static atom represents each average lattice position;
        this allows the LandmarkAnalysis to deal with, say, a rare exchange
        of two static atoms that does not change the structure of the lattice.

        It does NOT allow LandmarkAnalysis to deal with lattices whose
        structures actually change over the course of the trajectory. In
        certain cases this is better dealt with by ``MergeSitesByDynamics``.
    :param int max_mobile_per_site: The maximum number of mobile atoms that can
        be assigned to a single site without throwing an error. Regardless of
        the value, assignments of more than one mobile atom to a single site
        will be recorded and reported.

        Setting this to 2 can be necessary for very diffusive, liquid-like
        materials at high temperatures.

        Statistics related to this are reported in ``self.avg_mobile_per_site``
        and ``self.n_multiple_assignments``.
    :param bool force_no_memmap: If True, landmark vectors will be stored only
        in memory. Only useful if access to landmark vectors after the analysis
        has run is desired.
    :param bool verbose: Verbosity for the ``clustering_algorithm``. Other
        output is controlled through ``logging``.
    """
""" SITE_CENTERS_REAL_UNWEIGHTED = 'real-unweighted' SITE_CENTERS_REAL_WEIGHTED = 'real-weighted' SITE_CENTERS_REPRESENTATIVE_LANDMARK = 'representative-landmark' CLUSTERING_CLUSTER_SIZE = 'cluster-size' CLUSTERING_LABELS = 'cluster-labels' CLUSTERING_CONFIDENCES = 'cluster-confs' CLUSTERING_LANDMARK_GROUPINGS = 'cluster-landmark-groupings' CLUSTERING_REPRESENTATIVE_LANDMARKS = 'cluster-representative-lvecs' def __init__(self, clustering_algorithm='dotprod', clustering_params={}, cutoff_midpoint=1.5, cutoff_steepness=30, minimum_site_occupancy=0.01, site_centers_method=SITE_CENTERS_REAL_WEIGHTED, check_for_zero_landmarks=True, static_movement_threshold=1.0, dynamic_lattice_mapping=False, relaxed_lattice_checks=False, max_mobile_per_site=1, force_no_memmap=False, verbose=True): self._cutoff_midpoint = cutoff_midpoint self._cutoff_steepness = cutoff_steepness self._minimum_site_occupancy = minimum_site_occupancy self._cluster_algo = clustering_algorithm self._clustering_params = clustering_params self.verbose = verbose self.check_for_zero_landmarks = check_for_zero_landmarks self.site_centers_method = site_centers_method self.dynamic_lattice_mapping = dynamic_lattice_mapping self.relaxed_lattice_checks = relaxed_lattice_checks self._landmark_vectors = None self._landmark_dimension = None self.static_movement_threshold = static_movement_threshold self.max_mobile_per_site = max_mobile_per_site self.force_no_memmap = force_no_memmap self._has_run = False @property def cutoff(self): return self._cutoff @analysis_result def landmark_vectors(self): """Landmark vectors from the last invocation of ``run()``""" view = self._landmark_vectors[:] view.flags.writeable = False return view @analysis_result def landmark_dimension(self): """Number of components in a single landmark vector.""" return self._landmark_dimension def run(self, sn, frames): """Run the landmark analysis. The input ``SiteNetwork`` is a network of predicted sites; it's sites will be used as the "basis" for the landmark vectors. Wraps a copy of ``frames`` into the unit cell. Args: sn (SiteNetwork): The landmark basis. Each site is a landmark defined by its vertex static atoms, as indicated by `sn.vertices`. (Typically from ``VoronoiSiteGenerator``.) frames (ndarray n_frames x n_atoms x 3): A trajectory. Can be unwrapped; a copy will be wrapped before the analysis. """ assert isinstance(sn, SiteNetwork) if self._has_run: raise ValueError("Cannot rerun LandmarkAnalysis!") if frames.shape[1:] != (sn.n_total, 3): raise ValueError("Wrong shape %s for frames." 
% (frames.shape, )) if sn.vertices is None: raise ValueError("Input SiteNetwork must have vertices") n_frames = len(frames) logger.info("--- Running Landmark Analysis ---") # Create PBCCalculator self._pbcc = PBCCalculator(sn.structure.cell) # -- Step 0: Wrap to Unit Cell orig_frames = frames # Keep a reference around frames = frames.copy() # Flatten to list of points for wrapping orig_frame_shape = frames.shape frames.shape = (orig_frame_shape[0] * orig_frame_shape[1], 3) self._pbcc.wrap_points(frames) # Back to list of frames frames.shape = orig_frame_shape # -- Step 1: Compute site-to-vertex distances self._landmark_dimension = sn.n_sites longest_vert_set = np.max([len(v) for v in sn.vertices]) verts_np = np.array([ np.concatenate((v, [-1] * (longest_vert_set - len(v)))) for v in sn.vertices ], dtype=np.int) site_vert_dists = np.empty(shape=verts_np.shape, dtype=np.float) site_vert_dists.fill(np.nan) for i, polyhedron in enumerate(sn.vertices): verts_poses = sn.static_structure.get_positions()[polyhedron] dists = self._pbcc.distances(sn.centers[i], verts_poses) site_vert_dists[i, :len(polyhedron)] = dists # -- Step 2: Compute landmark vectors logger.info(" - computing landmark vectors -") # Compute landmark vectors # The dimension of one landmark vector is the number of Voronoi regions shape = (n_frames * sn.n_mobile, self._landmark_dimension) with tempfile.NamedTemporaryFile() as mmap_backing: if self.force_no_memmap: self._landmark_vectors = np.empty(shape=shape, dtype=np.float) else: self._landmark_vectors = np.memmap(mmap_backing.name, mode='w+', dtype=np.float, shape=shape) helpers._fill_landmark_vectors( self, sn, verts_np, site_vert_dists, frames, check_for_zeros=self.check_for_zero_landmarks, tqdm=tqdm, logger=logger) if not self.check_for_zero_landmarks and self.n_all_zero_lvecs > 0: logger.warning( " Had %i all-zero landmark vectors; no error because `check_for_zero_landmarks = False`." % self.n_all_zero_lvecs) elif self.check_for_zero_landmarks: assert self.n_all_zero_lvecs == 0 # -- Step 3: Cluster landmark vectors logger.info(" - clustering landmark vectors -") # - Cluster - # FIXME: remove reload after development done clustermod = importlib.import_module("..cluster." + self._cluster_algo, package=__name__) importlib.reload(clustermod) cluster_func = clustermod.do_landmark_clustering clustering = \ cluster_func(self._landmark_vectors, clustering_params = self._clustering_params, min_samples = self._minimum_site_occupancy / float(sn.n_mobile), verbose = self.verbose) cluster_counts = clustering[LandmarkAnalysis.CLUSTERING_CLUSTER_SIZE] lmk_lbls = clustering[LandmarkAnalysis.CLUSTERING_LABELS] lmk_confs = clustering[LandmarkAnalysis.CLUSTERING_CONFIDENCES] if LandmarkAnalysis.CLUSTERING_LANDMARK_GROUPINGS in clustering: landmark_clusters = clustering[ LandmarkAnalysis.CLUSTERING_LANDMARK_GROUPINGS] assert len(cluster_counts) == len(landmark_clusters) else: landmark_clusters = None if LandmarkAnalysis.CLUSTERING_REPRESENTATIVE_LANDMARKS in clustering: rep_lvecs = np.asarray(clustering[ LandmarkAnalysis.CLUSTERING_REPRESENTATIVE_LANDMARKS]) assert rep_lvecs.shape == (len(cluster_counts), self._landmark_vectors.shape[1]) else: rep_lvecs = None logging.info( " Failed to assign %i%% of mobile particle positions to sites." 
% (100.0 * np.sum(lmk_lbls < 0) / float(len(lmk_lbls)))) # reshape lables and confidences lmk_lbls.shape = (n_frames, sn.n_mobile) lmk_confs.shape = (n_frames, sn.n_mobile) n_sites = len(cluster_counts) if n_sites < (sn.n_mobile / self.max_mobile_per_site): raise InsufficientSitesError(verb="Landmark analysis", n_sites=n_sites, n_mobile=sn.n_mobile) logging.info(" Identified %i sites with assignment counts %s" % (n_sites, cluster_counts)) # -- Do output out_sn = sn.copy() # - Compute site centers site_centers = np.empty(shape=(n_sites, 3), dtype=frames.dtype) if self.site_centers_method == LandmarkAnalysis.SITE_CENTERS_REAL_WEIGHTED or \ self.site_centers_method == LandmarkAnalysis.SITE_CENTERS_REAL_UNWEIGHTED: for site in range(n_sites): mask = lmk_lbls == site pts = frames[:, sn.mobile_mask][mask] if self.site_centers_method == LandmarkAnalysis.SITE_CENTERS_REAL_WEIGHTED: site_centers[site] = self._pbcc.average( pts, weights=lmk_confs[mask]) else: site_centers[site] = self._pbcc.average(pts) elif self.site_centers_method == LandmarkAnalysis.SITE_CENTERS_REPRESENTATIVE_LANDMARK: if rep_lvecs is None: raise ValueError( "Chosen clustering method (with current parameters) didn't return representative landmark vectors; can't use SITE_CENTERS_REPRESENTATIVE_LANDMARK." ) for site in range(n_sites): weights_nonzero = rep_lvecs[site] > 0 site_centers[site] = self._pbcc.average( sn.centers[weights_nonzero], weights=rep_lvecs[site, weights_nonzero]) else: raise ValueError("Invalid site centers method '%s'" % self.site_centers_method) out_sn.centers = site_centers # - If clustering gave us that, compute site vertices if landmark_clusters is not None: vertices = [] for lclust in landmark_clusters: vertices.append( set.union(*[set(sn.vertices[l]) for l in lclust])) out_sn.vertices = vertices out_st = SiteTrajectory(out_sn, lmk_lbls, lmk_confs) # Check that multiple particles are never assigned to one site at the # same time, cause that would be wrong. self.n_multiple_assignments, self.avg_mobile_per_site = out_st.check_multiple_occupancy( max_mobile_per_site=self.max_mobile_per_site) out_st.set_real_traj(orig_frames) self._has_run = True return out_st
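
# Typical usage of this version (a sketch; `sn` must carry `vertices`, e.g.
# from VoronoiSiteGenerator, and `frames` may be unwrapped -- run() wraps a
# copy internally):
#
#     la = LandmarkAnalysis(
#         cutoff_midpoint=1.5, cutoff_steepness=30,
#         site_centers_method=LandmarkAnalysis.SITE_CENTERS_REAL_WEIGHTED)
#     st = la.run(sn, frames)   # SiteNetwork -> SiteTrajectory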
    def run(self, st):
        """Takes a SiteTrajectory and returns a SiteTrajectory, including a
        new SiteNetwork."""
        if self.check_types and st.site_network.site_types is None:
            raise ValueError(
                "Cannot run a check_types=True MergeSitesByDynamics on a SiteTrajectory without type information."
            )

        # Compute jump statistics
        if not st.site_network.has_attribute('p_ij'):
            ja = JumpAnalysis(verbose=self.verbose)
            ja.run(st)

        pbcc = PBCCalculator(st.site_network.structure.cell)
        site_centers = st.site_network.centers
        if self.check_types:
            site_types = st.site_network.site_types

        connectivity_matrix = st.site_network.p_ij
        assert st.site_network.n_sites == connectivity_matrix.shape[0]

        clusters = self._markov_clustering(connectivity_matrix,
                                           **self.markov_parameters)

        new_n_sites = len(clusters)
        if self.verbose:
            print("After merge there will be %i sites" % new_n_sites)

        if self.check_types:
            new_types = np.empty(shape=new_n_sites, dtype=np.int)

        new_centers = np.empty(shape=(new_n_sites, 3),
                               dtype=st.site_network.centers.dtype)
        translation = np.empty(shape=st.site_network.n_sites, dtype=np.int)
        translation.fill(-1)

        for newsite in range(new_n_sites):
            mask = list(clusters[newsite])

            # Update translation table
            if np.any(translation[mask] != -1):
                # We've assigned a different cluster for this before... weird
                # degeneracy
                raise ValueError(
                    "Markov clustering tried to merge site(s) into more than one new site"
                )
            translation[mask] = newsite

            to_merge = site_centers[mask]

            # Check distances
            dists = pbcc.distances(to_merge[0], to_merge[1:])
            assert np.all(dists < self.distance_threshold), \
                "Markov clustering tried to merge sites more than %f apart -- this may be valid, and the distance threshold may need to be increased." % self.distance_threshold

            # New site center
            new_centers[newsite] = pbcc.average(to_merge)
            if self.check_types:
                assert np.all(site_types[mask] == site_types[mask][0])
                new_types[newsite] = site_types[mask][0]

        newsn = st.site_network.copy()
        newsn.centers = new_centers
        if self.check_types:
            newsn.site_types = new_types

        newtraj = translation[st._traj]
        newtraj[st._traj == SiteTrajectory.SITE_UNKNOWN] = \
            SiteTrajectory.SITE_UNKNOWN

        # It doesn't make sense to propagate confidence information through a
        # transform that might completely invalidate it
        newst = SiteTrajectory(newsn, newtraj, confidences=None)

        if st.real_trajectory is not None:
            newst.set_real_traj(st.real_trajectory)
        return newst
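
# Self-contained sketch of the relabeling trick above: fancy-indexing the
# translation table with the whole site trajectory relabels every frame at
# once; SITE_UNKNOWN (-1) entries must be restored afterward, since -1 would
# otherwise silently index the table's last entry. (Toy data; `_example_`
# names are not part of the library.)
def _example_relabel():
    import numpy as np
    translation = np.array([0, 0, 1])   # old sites 0, 1 -> new 0; old 2 -> new 1
    traj = np.array([[0, 2], [1, -1]])  # -1 marks an unknown assignment
    newtraj = translation[traj]
    newtraj[traj == -1] = -1
    return newtraj                      # [[0, 1], [0, -1]]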
    def run(self, st, **kwargs):
        """Takes a ``SiteTrajectory`` and returns a new ``SiteTrajectory``."""
        if self.check_types and st.site_network.site_types is None:
            raise ValueError(
                "Cannot run a check_types=True MergeSites on a SiteTrajectory without type information."
            )

        pbcc = PBCCalculator(st.site_network.structure.cell)
        site_centers = st.site_network.centers
        if self.check_types:
            site_types = st.site_network.site_types

        clusters = self._get_sites_to_merge(st, **kwargs)

        old_n_sites = st.site_network.n_sites
        new_n_sites = len(clusters)
        logger.info(
            "After merging %i sites there will be %i sites for %i mobile particles"
            % (old_n_sites, new_n_sites, st.site_network.n_mobile))
        if new_n_sites < st.site_network.n_mobile:
            raise InsufficientSitesError(verb="Merging",
                                         n_sites=new_n_sites,
                                         n_mobile=st.site_network.n_mobile)

        if self.check_types:
            new_types = np.empty(shape=new_n_sites, dtype=np.int)

        merge_verts = st.site_network.vertices is not None
        if merge_verts:
            new_verts = []

        # -- Merge Sites
        new_centers = np.empty(shape=(new_n_sites, 3),
                               dtype=st.site_network.centers.dtype)
        translation = np.empty(shape=st.site_network.n_sites, dtype=np.int)
        translation.fill(-1)

        for newsite in range(new_n_sites):
            mask = list(clusters[newsite])

            # Update translation table
            if np.any(translation[mask] != -1):
                # We've assigned a different cluster for this before... weird
                # degeneracy
                raise ValueError(
                    "Site merging tried to merge site(s) into more than one new site. This shouldn't happen."
                )
            translation[mask] = newsite

            to_merge = site_centers[mask]

            # Check distances
            if self.maximum_merge_distance is not None:
                dists = pbcc.distances(to_merge[0], to_merge[1:])
                if not np.all(dists <= self.maximum_merge_distance):
                    raise MergedSitesTooDistantError(
                        "Tried to merge sites more than %.2f apart. Lower your distance_threshold?"
                        % self.maximum_merge_distance)

            # New site center
            if self.weighted_spatial_average:
                # Weight the average by how often each merged site was occupied.
                occs = st.site_network.occupancies[mask]
                new_centers[newsite] = pbcc.average(to_merge, weights=occs)
            else:
                new_centers[newsite] = pbcc.average(to_merge)
            if self.check_types:
                assert np.all(site_types[mask] == site_types[mask][0])
                new_types[newsite] = site_types[mask][0]
            if merge_verts:
                new_verts.append(
                    set.union(
                        *[set(st.site_network.vertices[i]) for i in mask]))

        newsn = st.site_network.copy()
        newsn.centers = new_centers
        if self.check_types:
            newsn.site_types = new_types
        if merge_verts:
            newsn.vertices = new_verts

        newtraj = translation[st._traj]
        newtraj[st._traj == SiteTrajectory.SITE_UNKNOWN] = \
            SiteTrajectory.SITE_UNKNOWN

        # It doesn't make sense to propagate confidence information through a
        # transform that might completely invalidate it
        newst = SiteTrajectory(newsn, newtraj, confidences=None)

        if st.real_trajectory is not None:
            newst.set_real_traj(st.real_trajectory)

        if self.set_merged_into:
            if st.site_network.has_attribute("merged_into"):
                st.site_network.remove_attribute("merged_into")
            st.site_network.add_site_attribute("merged_into", translation)

        return newst
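
# Sketch of how this generic merge step is typically driven (hypothetical
# parameter values; the `_get_sites_to_merge` hook is supplied by subclasses
# such as the dynamics- and attribute-based mergers above):
#
#     merger = MergeSitesByDynamics(distance_threshold=1.0)
#     st_merged = merger.run(st)      # SiteTrajectory -> SiteTrajectory
#     st.site_network.merged_into    # old-site -> new-site map,
#                                    # if set_merged_into is enabled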