def load_trjs_or_features(args):
    """Load the data to be clustered, from features or from trajectories.

    When ``args.features`` is truthy the features are read with
    ``load_features``. Otherwise the files in ``args.trajectories`` are
    read with ``load_trajectories`` and the resulting coordinates are
    wrapped in an ``md.Trajectory``.

    Parameters
    ----------
    args : object
        Parsed command-line options. Reads ``features`` and ``subsample``
        and, on the trajectory path, ``trajectories``, ``topologies`` and
        ``atoms``.

    Returns
    -------
    lengths, data
        ``lengths`` as reported by the loader that was used, and ``data``,
        which is either whatever ``load_features`` returned or an
        ``md.Trajectory``.
    """
    if args.features:
        with timed("Loading features took %.1f s.", logger.info):
            lengths, data = load_features(
                args.features, stride=args.subsample)
        return lengths, data

    assert args.trajectories
    assert len(args.trajectories) == len(args.topologies)

    # Summarise, per topology file, how many trajectory files go with it.
    file_counts = {}
    for top_path, trj_paths in zip(args.topologies, args.trajectories):
        file_counts[os.path.basename(top_path)] = "%s files" % len(trj_paths)
    logger.info("Beginning clustering; targets:\n%s",
                json.dumps(file_counts, indent=4))

    with timed("Loading trajectories took %.1f s.", logger.info):
        lengths, coords, selected_top = load_trajectories(
            args.topologies,
            args.trajectories,
            selections=args.atoms,
            stride=args.subsample,
            processes=auto_nprocs())

    logger.info("Clustering using %s atoms matching '%s'.",
                coords.shape[1], args.atoms)

    # md.rmsd operates on md.Trajectory objects, so pair the raw
    # coordinates with the topology of the selected atoms.
    data = md.Trajectory(xyz=coords, topology=selected_top)
    return lengths, data
def main(argv=None):
    """Run the clustering application end to end.

    Parses the command line, loads the data, fits ``args.Clusterer`` and,
    on MPI rank 0, writes the center indices and center structures and
    calls ``write_assignments_and_distances_with_reassign``.

    Parameters
    ----------
    argv : list of str, optional
        Passed unchanged to ``process_command_line``.

    Returns
    -------
    int
        Always 0.
    """
    args = process_command_line(argv)

    # note that in MPI mode, lengths will be global, whereas data will
    # be local (i.e. only this node's data).
    lengths, data = load_trjs_or_features(args)

    kwargs = {}
    if args.cluster_iterations is not None:
        kwargs['kmedoids_updates'] = int(args.cluster_iterations)

    clustering = args.Clusterer(
        metric=args.cluster_distance,
        n_clusters=args.cluster_number,
        cluster_radius=args.cluster_radius,
        mpi_mode=mpi_mode,
        **kwargs)

    clustering.fit(data)
    # release the RAM held by the trajectories (we don't need it anymore)
    del data

    logger.info(
        "Clustered %s frames into %s clusters in %s seconds.",
        sum(lengths), len(clustering.centers_), clustering.runtime_)

    result = clustering.result_
    if mpi_mode:
        local_ctr_inds, local_dists, local_assigs = \
            result.center_indices, result.distances, result.assignments

        # Use the module logger here like every other message in this
        # function (the root-level ``logging.info`` was used previously).
        with timed("Reassembled dist and assign arrays in %.2f sec",
                   logger.info):
            all_dists = mpi.ops.assemble_striped_ragged_array(
                local_dists, lengths)
            all_assigs = mpi.ops.assemble_striped_ragged_array(
                local_assigs, lengths)
            ctr_inds = mpi.ops.convert_local_indices(local_ctr_inds, lengths)

        result = ClusterResult(
            center_indices=ctr_inds,
            distances=all_dists,
            assignments=all_assigs,
            centers=result.centers)
    result = result.partition(lengths)

    # Only rank 0 writes output; every rank then waits at the barrier.
    if mpi.rank() == 0:
        with timed("Wrote center indices in %.2f sec.", logger.info):
            # Frame indices refer to the subsampled data; scale them by
            # the stride before writing.
            write_centers_indices(
                args.center_indices,
                [(t, f * args.subsample) for t, f in result.center_indices])
        with timed("Wrote center structures in %.2f sec.", logger.info):
            write_centers(result, args)
        write_assignments_and_distances_with_reassign(result, args)

    mpi.comm.barrier()

    logger.info("Success! Data can be found in %s.",
                os.path.dirname(args.distances))

    return 0
def load(topologies, trajectories, selections, stride, processes):
    """Load several sets of trajectories into one concatenated array.

    Parameters
    ----------
    topologies : list
        Topology files, one per trajectory set.
    trajectories : list of lists
        For each topology, the trajectory files that use it.
    selections : list of str
        Atom selection strings, one per topology. Every selection must
        pick the same number of atoms.
    stride : int
        Passed to the loader as ``stride`` for every trajectory.
    processes : int
        Passed to ``load_as_concatenated`` as ``processes``.

    Returns
    -------
    lengths, xyz, top
        ``lengths`` and ``xyz`` as returned by ``load_as_concatenated``
        (``xyz`` is returned as a fresh copy), and the last topology
        subset to its selected atoms.

    Raises
    ------
    exception.ImproperlyConfigured
        If a selection cannot be applied to its topology, if selections
        pick differing numbers of atoms, or if no topology/selection
        pairs were given.
    AssertionError
        If the selection matches zero atoms.
    """
    # Fail fast on a bad selection before any trajectory data is read.
    for topfile, selection in zip(topologies, selections):
        sentinel_trj = md.load(topfile)
        try:
            # noop, but causes fast-fail w/bad args.atoms
            sentinel_trj.top.select(selection)
        except Exception as err:
            raise exception.ImproperlyConfigured(
                ("The provided selection '{s}' didn't match the topology "
                 "file, {t}").format(s=selection, t=topfile)) from err

    flat_trjs = []
    configs = []
    n_inds = None
    top = None
    indices = None

    for topfile, trjset, selection in zip(topologies, trajectories,
                                          selections):
        top = md.load(topfile).top
        indices = top.select(selection)

        if n_inds is not None and n_inds != len(indices):
            raise exception.ImproperlyConfigured(
                ("Selection on topology %s selected %s atoms, but "
                 "other selections selected %s atoms.") %
                (topfile, len(indices), n_inds))
        n_inds = len(indices)

        for trj in trjset:
            flat_trjs.append(trj)
            configs.append({
                'top': top,
                'stride': stride,
                'atom_indices': indices,
            })

    if n_inds is None:
        # The loop never ran, so there is no topology to report on or
        # return.
        raise exception.ImproperlyConfigured(
            "No topologies, trajectories and selections were provided.")

    # ``indices`` already holds the last topology's selection, so it is
    # reused below instead of re-running the selection.
    logger.info(
        "Loading %s trajectories with %s atoms using %s processes "
        "(subsampling %s)",
        len(flat_trjs), n_inds, processes, stride)

    assert n_inds > 0, "No atoms selected for clustering"

    with timed("Loading took %.1f sec", logger.info):
        lengths, xyz = load_as_concatenated(
            flat_trjs, args=configs, processes=processes)

    # Replace the loaded array with a fresh copy and drop the original.
    with timed("Turned over array in %.2f min", logger.info):
        tmp_xyz = xyz.copy()
        del xyz
        xyz = tmp_xyz

    logger.info("Loaded %s frames.", len(xyz))

    return lengths, xyz, top.subset(indices)
def load_features(features, stride):
    """Load feature data from one RaggedArray file or from many npy files.

    Parameters
    ----------
    features : list
        Feature file paths. Exactly one entry is read with ``ra.load``;
        any other count is handed to ``mpi.io.load_npy_as_striped``.
    stride : int
        Forwarded to ``mpi.io.load_npy_as_striped``. It is not used on
        the single-file path.

    Returns
    -------
    lengths, data
        The lengths and the flat data array of the loaded features.
    """
    if len(features) != 1:
        with timed("Loading features took %.1f s.", logger.info):
            lengths, data = mpi.io.load_npy_as_striped(features, stride)

        # Swap the loaded array for a fresh copy and release the original.
        with timed("Turned over array in %.2f min", logger.info):
            fresh = data.copy()
            del data
            data = fresh

        return lengths, data

    path = features[0]
    with timed("Loading features took %.1f s.", logger.info):
        try:
            ragged = ra.load(path)
        except tables.exceptions.NoSuchNodeError:
            # Retry with an explicit ``keys=...`` (Ellipsis) argument.
            ragged = ra.load(path, keys=...)

    return ragged.lengths, ragged._data
def load_features(features, stride):
    """Load striped feature data from one h5 file or from many npy files.

    Parameters
    ----------
    features : list
        Feature file paths. Exactly one entry is read with
        ``mpi.io.load_h5_as_striped``; any other count is handed to
        ``mpi.io.load_npy_as_striped``.
    stride : int
        Forwarded to whichever loader is used.

    Returns
    -------
    lengths, data
        As returned by the loader.

    Raises
    ------
    MemoryError
        Re-raised after logging an error if loading runs out of memory.
    """
    try:
        if len(features) != 1:
            with timed("Loading features took %.1f s.", logger.info):
                lengths, data = mpi.io.load_npy_as_striped(features, stride)

            # Swap the loaded array for a fresh copy and release the
            # original.
            with timed("Turned over array in %.2f min", logger.info):
                fresh = data.copy()
                del data
                data = fresh
        else:
            with timed("Loading features took %.1f s.", logger.info):
                lengths, data = mpi.io.load_h5_as_striped(
                    features[0], stride)
    except MemoryError:
        logger.error(
            "Ran out of memory trying to allocate features array"
            " from file %s",
            features[0])
        raise

    logger.info("Loaded %s trajectories with %s frames with stride %s.",
                len(lengths), len(data), stride)

    return lengths, data
def main(argv=None):
    """Entry point for the CARDS driver script.

    Parses the command line, loads the trajectories, computes the CARDS
    correlations, then saves the four matrices and the dihedral indices.

    Parameters
    ----------
    argv : list of str, optional
        Passed unchanged to ``process_command_line``.

    Returns
    -------
    int
        Always 0.
    """
    options = process_command_line(argv)
    trajectories = load_trajs(options)

    with timed("Calculating CARDS correlations took %.1f s.", logger.info):
        ss_mi, dd_mi, sd_mi, ds_mi, dihedral_inds = cards(
            trajectories, options.buffer_size, options.processes)
    logger.info("Completed correlations. ")

    save_cards(ss_mi, dd_mi, sd_mi, ds_mi, options.matrices)

    indices_path = options.indices
    np.savetxt(indices_path, dihedral_inds, delimiter=",")
    logger.info("Saved dihedral indices as %s", indices_path)

    return 0
def main(argv=None):
    """Run the driver script for MPI k-centers (plus k-medoids) clustering.

    This code only runs if we're being run as a script. Otherwise, it's
    silent and just exposes methods.

    Each rank loads its stripe of the trajectories, clusters with
    ``kcenters_mpi`` and refines with ``args.kmedoids_iters`` calls to
    ``_kmedoids_pam_update``. The per-rank results are then reassembled
    and rank 0 writes the outputs.

    Parameters
    ----------
    argv : list of str, optional
        Passed unchanged to ``process_command_line``.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    NotImplementedError
        If more than one cluster radius was requested.
    """
    args = process_command_line(argv)

    # Reject unsupported options before the trajectory load rather than
    # after it.
    if len(args.cluster_radii) > 1:
        raise NotImplementedError(
            "Multiple cluster radii are not yet supported")

    top = md.load(args.topology).top
    atom_ids = top.select(args.selection)

    logging.info("Running with %s total workers.", MPI_SIZE)

    logging.info(
        "Loading trajectories [%s::%s]; selection == '%s' w/ "
        "subsampling %s", MPI_RANK, MPI_SIZE, args.selection,
        args.subsample)

    with timed("load_as_concatenated took %.2f sec", logging.info):
        global_lengths, my_xyz = mpi.io.load_as_striped(
            filenames=args.trajectories,
            top=top,
            atom_indices=atom_ids,
            stride=args.subsample,
            processes=args.processes)

    # Replace the loaded array with a fresh copy and drop the original.
    with timed("Turned over array in %.2f min", logging.info):
        xyz = my_xyz.copy()
        del my_xyz
        my_xyz = xyz

    logging.info(
        "Loaded %s frames in %s trjs (%.2fG).",
        len(my_xyz), len(args.trajectories) // MPI_SIZE,
        my_xyz.data.nbytes / 1024**3)

    trjs = md.Trajectory(my_xyz, topology=top.subset(atom_ids))

    # NOTE(review): dividing ru_maxrss by 1024**2 to get GB assumes the
    # platform reports it in kilobytes -- confirm for non-Linux hosts.
    logging.info(
        "Beginning kcenters clustering with memory footprint of %.2fG "
        "RAM (coords are %.2fG; total VRAM is %.2fG)",
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024**2,
        trjs.xyz.nbytes / 1024**3,
        psutil.virtual_memory().total / 1024**3)

    tick = time.perf_counter()
    local_dists, local_assigs, local_ctr_inds = kcenters_mpi(
        trjs, md.rmsd, dist_cutoff=args.cluster_radii[0])
    tock = time.perf_counter()

    logging.info(
        "Finished kcenters clustering using %.2fG RAM (coords are "
        "%.2fG) in %.2f min.",
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024**2,
        trjs.xyz.nbytes / 1024**3,
        (tock - tick) / 60)

    for i in range(args.kmedoids_iters):
        with timed("KMedoids iteration {i} took %.2f sec".format(i=i),
                   logging.info):
            local_ctr_inds, local_dists, local_assigs = _kmedoids_pam_update(
                X=trjs, metric=md.rmsd,
                medoid_inds=local_ctr_inds,
                assignments=local_assigs,
                distances=local_dists,
                random_state=args.random_state)

    with timed("Reassembled dist and assign arrays in %.2f sec",
               logging.info):
        all_dists = mpi.ops.assemble_striped_ragged_array(
            local_dists, global_lengths)
        all_assigs = mpi.ops.assemble_striped_ragged_array(
            local_assigs, global_lengths)

        ctr_inds = mpi.ops.convert_local_indices(
            local_ctr_inds, global_lengths)
        ctr_inds = partition_indices(ctr_inds, global_lengths)

    # Only rank 0 writes output files.
    if MPI_RANK == 0:
        logging.info("Dumping center indices to %s", args.center_indices)

        with open(args.center_indices, 'wb') as f:
            # Frame indices refer to the subsampled data; scale them by
            # the stride before writing.
            pickle.dump(
                [(trj, frame * args.subsample) for trj, frame in ctr_inds],
                f)

        if args.distances:
            ra.save(args.distances,
                    ra.RaggedArray(all_dists, lengths=global_lengths))
        if args.assignments:
            ra.save(args.assignments,
                    ra.RaggedArray(all_assigs, lengths=global_lengths))

        centers = load_frames(
            args.trajectories,
            ctr_inds,
            stride=args.subsample,
            top=md.load(args.topology).top)

        with open(args.center_structures, 'wb') as f:
            pickle.dump(centers, f)
        logging.info("Wrote %s centers to %s", len(centers),
                     args.center_structures)

    return 0
def reassign(topologies, trajectories, atoms, centers, frac_mem=0.5):
    """Reassign a set of trajectories based on a subset of atoms and centers.

    Parameters
    ----------
    topologies : list
        List of topologies corresponding to the trajectories to be
        reassigned.
    trajectories : list of lists
        List of lists of trajectories to be loaded in batches and
        reassigned.
    atoms : list
        List of MDTraj atom query strings. Each string is applied to the
        corresponding topology to choose which atoms will be used for the
        reassignment.
    centers : md.Trajectory or list of trajectories
        The atoms representing the centers to reassign to. Note that
        ``center_coordinates()`` is called on every center for its effect
        (the return value is discarded), so center objects passed in a
        list are modified.
    frac_mem : float, default=0.5
        The fraction of main RAM to use for trajectories. A lower number
        will mean more batches.

    Returns
    -------
    assignments, distances
        Two ``np.ndarray`` objects when every trajectory yielded the same
        number of frames, otherwise two ``ra.RaggedArray`` objects.

    Raises
    ------
    enspara.exception.ImproperlyConfigured
        If the numbers of topologies, trajectory sets and atom selections
        do not agree.
    AssertionError
        If a trajectory file does not exist.
    """
    n_procs = enspara.util.parallel.auto_nprocs()

    # check input validity
    if len(topologies) != len(trajectories):
        raise enspara.exception.ImproperlyConfigured(
            "Number of topologies (%s) didn't match number of sets of "
            "trajectories (%s)." % (len(topologies), len(trajectories)))
    if len(topologies) != len(atoms):
        raise enspara.exception.ImproperlyConfigured(
            "Number of topologies (%s) didn't match number of atom selection "
            "strings (%s)." % (len(topologies), len(atoms)))

    # iteration across md.Trajectory is insanely slow. Do it only once here.
    if isinstance(centers, md.Trajectory):
        tick = time.perf_counter()
        logger.info('Centers are an md.Trajectory. Creating trj-list to '
                    'avoid repeated iteration.')
        # using in-place copies to reduce memory usage (and for speed)
        centers = [centers.slice(i, copy=False) for i in range(len(centers))]
        logger.info('Built trj list in %.1f seconds.',
                    time.perf_counter() - tick)

    # precenter centers (there will be many RMSD calcs here)
    for c in centers:
        c.center_coordinates()

    with timed("Reassignment took %.1f seconds.", logger.info):
        # build flat list of targets; the loop variable is deliberately
        # not called ``atoms`` so the parameter is not rebound.
        targets = []
        for topfile, trjfiles, selection in zip(topologies, trajectories,
                                                atoms):
            t = md.load(topfile).top
            atom_ids = t.select(selection)
            for trjfile in trjfiles:
                assert os.path.exists(trjfile), (
                    "Trajectory file %s does not exist." % trjfile)
                targets.append((trjfile, t, atom_ids))

        # determine trajectory length
        tick_sounding = time.perf_counter()
        logger.info("Sounding dataset of %s trajectories and %s topologies.",
                    sum(len(t) for t in trajectories), len(topologies))

        lengths = Parallel(n_jobs=n_procs)(
            delayed(sound_trajectory)(f) for f, _, _ in targets)

        logger.info("Sounded %s trajectories with %s frames (median length "
                    "%i frames) in %.1f seconds.",
                    len(lengths), sum(lengths), np.median(lengths),
                    time.perf_counter() - tick_sounding)

        assignments, distances = batch_reassign(
            targets, centers, lengths, frac_mem=frac_mem, n_procs=n_procs)

    # Equal-length results can be stacked into regular arrays; otherwise
    # fall back to ragged arrays.
    if all(len(assignments[0]) == len(a) for a in assignments):
        logger.info("Trajectory lengths are homogenous. Output will "
                    "be np.ndarrays.")
        assert all(len(distances[0]) == len(d) for d in distances)
        return np.array(assignments), np.array(distances)

    logger.info("Trajectory lengths are heterogenous. Output will "
                "be ra.RaggedArrays.")
    return ra.RaggedArray(assignments), ra.RaggedArray(distances)
def batch_reassign(targets, centers, lengths, frac_mem, n_procs=None):
    """Assign frames to their nearest center, loading targets in batches.

    Parameters
    ----------
    targets : list of tuple
        ``(trajectory file, topology, atom indices)`` triples.
    centers : list
        Center structures; ``centers[0]`` supplies the atom count and the
        topology used for every batch.
    lengths : list of int
        Length of each target, in the same order as ``targets``.
    frac_mem : float
        Passed to ``determine_batch_size`` to choose the batch size.
    n_procs : int, optional
        Passed to ``load_as_concatenated`` as ``processes``.

    Returns
    -------
    assignments, distances : list, list
        The per-batch assignments and distances, each split with
        ``partition_list`` using the batch's lengths and collected in
        batch order.

    Raises
    ------
    enspara.exception.ImproperlyConfigured
        If the batch size is smaller than the longest target.
    """
    example_center = centers[0]

    DTYPE_BYTES = 4
    batch_size, batch_gb = determine_batch_size(
        example_center.n_atoms, DTYPE_BYTES, frac_mem)

    logger.info(
        'Batch max size set to %s frames (~%.2f GB, %.1f%% of total RAM).',
        batch_size, batch_gb, frac_mem*100)

    longest = max(lengths)
    if batch_size < longest:
        raise enspara.exception.ImproperlyConfigured(
            'Batch size of %s was smaller than largest file (size %s).' %
            (batch_size, longest))

    batches = compute_batches(lengths, batch_size)
    n_batches = len(batches)

    assignments = []
    distances = []

    # Batches are numbered from 1 in both the start and finish messages.
    for batch_num, batch_indices in enumerate(batches, start=1):
        tick = time.perf_counter()
        logger.info("Starting batch %s of %s", batch_num, n_batches)
        batch_targets = [targets[j] for j in batch_indices]

        with timed("Loaded frames for batch in %.1f seconds", logger.info):
            batch_lengths, xyz = load_as_concatenated(
                [tfile for tfile, _, _ in batch_targets],
                lengths=[lengths[j] for j in batch_indices],
                args=[{'top': top, 'atom_indices': aids}
                      for _, top, aids in batch_targets],
                processes=n_procs)

        # mdtraj loads as float32, and load_as_concatenated should thus
        # also load as float32. This should _never_ be hit, but there might
        # be some platform-specific situation where double != float64?
        assert xyz.dtype.itemsize == DTYPE_BYTES

        trj = md.Trajectory(xyz, topology=example_center.top)

        with timed("Precentered trajectories in %.1f seconds", logger.debug):
            trj.center_coordinates()

        with timed("Assigned trajectories in %.1f seconds", logger.debug):
            batch_assignments, batch_distances = assign_to_nearest_center(
                trj, centers, partial(md.rmsd, precentered=True))

        # clear memory of xyz and trj to allow cleanup to deallocate
        # these large arrays; may help with memory high-water mark
        with timed("Cleared array from memory in %.1f seconds", logger.debug):
            xyz_size = xyz.size
            del trj, xyz

        assignments.extend(partition_list(batch_assignments, batch_lengths))
        distances.extend(partition_list(batch_distances, batch_lengths))

        # NOTE(review): dividing ru_maxrss by 1024**2 to get GB assumes the
        # platform reports it in kilobytes -- confirm for non-Linux hosts.
        logger.info(
            "Finished batch %s of %s in %.1f seconds. Coordinates array had "
            "memory footprint of %.2f GB (of memory high-water mark %.2f/%.2f "
            "GB).",
            batch_num, n_batches, time.perf_counter() - tick,
            xyz_size * DTYPE_BYTES / 1024**3,
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024**2,
            psutil.virtual_memory().total / 1024**3)

    return assignments, distances