def test_assign_to_nearest_center_few_centers():
    # assign_to_nearest_center takes two code paths, one for
    # n_centers > n_frames and one for n_frames > n_centers. This tests
    # the latter.
    trj = md.load(get_fn('frame0.xtc'), top=get_fn('native.pdb'))
    center_frames = [0, int(len(trj) / 3), int(len(trj) / 2)]

    assigns, distances = util.assign_to_nearest_center(
        trj, trj[center_frames], md.rmsd)

    # Recompute the full frame-by-center RMSD matrix independently and
    # check that the reported assignment/distance pair matches the
    # column-wise minimum.
    expected_dists = np.array(
        [md.rmsd(trj, ref_frame) for ref_frame in trj[center_frames]])

    assert_allclose(expected_dists.min(axis=0), distances, atol=1e-3)
    assert_array_equal(expected_dists.argmin(axis=0), assigns)
def batch_reassign(targets, centers, lengths, frac_mem, n_procs=None):
    """Reassign trajectory frames to cluster centers in RAM-sized batches.

    Parameters
    ----------
    targets : list of (filename, topology, atom_indices) tuples
        Trajectory files to load and reassign.
    centers : md.Trajectory or sequence of md.Trajectory
        Cluster centers frames are assigned to; ``centers[0]`` supplies
        the topology and atom count used for batch sizing.
    lengths : list of int
        Number of frames in each target file (parallel to ``targets``).
    frac_mem : float
        Fraction of total system RAM a single batch may occupy.
    n_procs : int, optional
        Number of processes used by ``load_as_concatenated``.

    Returns
    -------
    assignments, distances : list
        Per-trajectory arrays of assigned center indices and distances
        to the assigned center, partitioned back to input file lengths.

    Raises
    ------
    enspara.exception.ImproperlyConfigured
        If the computed batch size is smaller than the largest single
        input file (a file can never span two batches).
    """
    example_center = centers[0]
    DTYPE_BYTES = 4  # mdtraj coordinates are float32
    batch_size, batch_gb = determine_batch_size(
        example_center.n_atoms, DTYPE_BYTES, frac_mem)

    logger.info(
        'Batch max size set to %s frames (~%.2f GB, %.1f%% of total RAM).'
        % (batch_size, batch_gb, frac_mem*100))

    if batch_size < max(lengths):
        raise enspara.exception.ImproperlyConfigured(
            'Batch size of %s was smaller than largest file (size %s).' %
            (batch_size, max(lengths)))

    batches = compute_batches(lengths, batch_size)

    assignments = []
    distances = []
    for i, batch_indices in enumerate(batches):
        tick = time.perf_counter()
        logger.info("Starting batch %s of %s", i+1, len(batches))
        # NOTE: use a distinct name (idx) here; the original code shadowed
        # the enumerate variable `i` inside these comprehensions.
        batch_targets = [targets[idx] for idx in batch_indices]

        with timed("Loaded frames for batch in %.1f seconds", logger.info):
            batch_lengths, xyz = load_as_concatenated(
                [tfile for tfile, top, aids in batch_targets],
                lengths=[lengths[idx] for idx in batch_indices],
                args=[{'top': top, 'atom_indices': aids}
                      for t, top, aids in batch_targets],
                processes=n_procs)

        # mdtraj loads as float32, and load_as_concatenated should thus
        # also load as float32. This should _never_ be hit, but there might
        # be some platform-specific situation where double != float64?
        assert xyz.dtype.itemsize == DTYPE_BYTES

        trj = md.Trajectory(xyz, topology=example_center.top)

        with timed("Precentered trajectories in %.1f seconds",
                   logger.debug):
            trj.center_coordinates()

        with timed("Assigned trajectories in %.1f seconds", logger.debug):
            batch_assignments, batch_distances = assign_to_nearest_center(
                trj, centers, partial(md.rmsd, precentered=True))

        # clear memory of xyz and trj to allow cleanup to deallocate
        # these large arrays; may help with memory high-water mark
        with timed("Cleared array from memory in %.1f seconds",
                   logger.debug):
            xyz_size = xyz.size
            del trj, xyz

        assignments.extend(
            partition_list(batch_assignments, batch_lengths))
        distances.extend(
            partition_list(batch_distances, batch_lengths))

        # FIX: report 1-based batch number (i+1) to match the
        # "Starting batch %s of %s" message above; previously this
        # logged the 0-based index.
        logger.info(
            "Finished batch %s of %s in %.1f seconds. Coordinates array had "
            "memory footprint of %.2f GB (of memory high-water mark "
            "%.2f/%.2f GB).",
            i+1, len(batches),
            time.perf_counter() - tick,
            xyz_size * DTYPE_BYTES / 1024**3,
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024**2,
            psutil.virtual_memory().total / 1024**3)

    return assignments, distances