def frames_from_files(files, top, frames, chunksize=1000, stride=1, verbose=False, copy_not_join=None, reader=None):
    """
    Constructs a Trajectory object out of given frames collected from files (or given reader).

    :param files: source files (a single file name or a sequence of file names)
    :param top: topology file
    :param frames: indices; either a one dimensional array of frame indices
        (all taken from the first file) or a two dimensional array whose
        rows are (file index, frame index) pairs
    :param chunksize: chunk size handed to the reader's iterator
    :param stride: the frame indices are multiplied by this value before reading
    :param verbose: log a message if a stride different from one is applied
    :param copy_not_join: not used
    :param reader: if a reader is given, ignore files and top param!
    :return: mdtraj.Trajectory consisting out of frames indices, in the
        order given by frames.
    :raises ValueError: if the reader is empty or of an unsupported type, or
        if a frame index lies beyond the end of its trajectory.
    """
    reader_given = reader is not None

    # Enforce topology to be a md.Topology object
    if reader is None:
        top = _enforce_top(top)
    else:
        if not reader.number_of_trajectories():
            raise ValueError("need at least one trajectory file in reader.")
        if isinstance(reader, FragmentedTrajectoryReader):
            top = reader._readers[0][0].featurizer.topology
        elif isinstance(reader, FeatureReader):
            top = reader.featurizer.topology
        else:
            raise ValueError("unsupported reader (only md readers).")

    stride = int(stride)
    frames = np.array(frames)

    # only one file, so we expect frames to be a one dimensional array
    if isinstance(files, str):
        files = [files]

    if frames.ndim == 1:
        # insert a constant column for file index
        frames = np.insert(np.atleast_2d(frames), 0, np.zeros_like(frames), axis=0).T

    if stride != 1:
        frames[:, 1] *= stride
        if verbose:
            log.info('A stride value of = %u was parsed, '
                     'interpreting "indexes" accordingly.' % stride)

    # sort by file and frame index
    sort_inds = np.lexsort((frames[:, 1], frames[:, 0]))
    sorted_inds = frames[sort_inds]
    assert len(sorted_inds) == len(frames)

    file_inds_unique = np.unique(sorted_inds[:, 0])

    # construct reader
    if reader is None:
        # filter out files, we would never read, because no indices are pointing to them
        reader = source(np.array(files)[file_inds_unique].tolist(), top=top)
        # re-map indices to reflect filtered files:
        for itraj, c in zip(file_inds_unique, itertools.count(0)):
            mask = sorted_inds[:, 0] == itraj
            sorted_inds[mask, 0] = c
        inds_to_check = np.arange(len(file_inds_unique))
    else:
        inds_to_check = file_inds_unique

    # sanity check of indices
    for itraj in inds_to_check:
        # only the frame column is relevant here; the file index column
        # must not take part in the maximum.
        inds_by_traj = sorted_inds[sorted_inds[:, 0] == itraj][:, 1]
        largest_ind_in_traj = np.max(inds_by_traj)
        length = reader.trajectory_length(itraj)
        # indices are zero based, so an index equal to the length is invalid too
        if largest_ind_in_traj >= length:
            raise ValueError(
                "largest specified index (%i * stride=%i * %i=%i) "
                "is larger than trajectory length '%s' = %i" %
                (largest_ind_in_traj // stride, largest_ind_in_traj // stride, stride,
                 largest_ind_in_traj, reader.filenames[itraj], length))

    def set_reader_return_traj_objects(rdr, flag):
        # toggle whether the underlying FeatureReader(s) yield mdtraj.Trajectory objects
        if isinstance(rdr, FeatureReader):
            rdr._return_traj_obj = flag
        elif isinstance(rdr, FragmentedTrajectoryReader):
            for fname in rdr.filenames_flat:
                r = rdr.reader_by_filename(fname)
                if isinstance(r, FeatureReader):
                    r = [r]
                for _r in r:
                    _r._return_traj_obj = flag

    try:
        # we want the FeatureReader to return mdtraj.Trajectory objects
        set_reader_return_traj_objects(reader, True)

        it = reader.iterator(chunk=chunksize, stride=sorted_inds, return_trajindex=False)
        reporter = ProgressReporter()
        reporter._progress_register(it._n_chunks, description="collecting frames")
        collected_frames = []
        with it:
            for x in it:
                collected_frames.append(x)
                reporter._progress_update(1)
        reporter._progress_force_finish()
    finally:
        # do not leave a reader owned by the caller switched to trajectory output
        if reader_given:
            set_reader_return_traj_objects(reader, False)

    dest = _preallocate_empty_trajectory(top, len(frames))
    i = 0
    for chunk in collected_frames:
        _copy_traj_attributes(dest, chunk, i)
        i += len(chunk)

    # undo the sorting, so the frames appear in the requested order
    dest = dest.slice(sort_inds.argsort(), copy=False)
    return dest
def frames_from_files(files, top, frames, chunksize=1000, stride=1, verbose=False, copy_not_join=None, reader=None):
    """
    Constructs a Trajectory object out of given frames collected from files (or given reader).

    :param files: source files (a single file name or a sequence of file names)
    :param top: topology file
    :param frames: indices; either a one dimensional array of frame indices
        (all taken from the first file) or a two dimensional array whose
        rows are (file index, frame index) pairs
    :param chunksize: chunk size handed to the reader's iterator
    :param stride: the frame indices are multiplied by this value before reading
    :param verbose: log a message if a stride different from one is applied
    :param copy_not_join: not used
    :param reader: if a reader is given, ignore files and top param!
    :return: mdtraj.Trajectory consisting out of frames indices, in the
        order given by frames.
    :raises ValueError: if the reader is empty or of an unsupported type, or
        if a frame index lies beyond the end of its trajectory.
    """
    reader_given = reader is not None

    # Enforce topology to be a md.Topology object
    if not reader_given:
        top = _enforce_top(top)
    else:
        if not reader.number_of_trajectories():
            raise ValueError("need at least one trajectory file in reader.")
        if isinstance(reader, FragmentedTrajectoryReader):
            top = reader._readers[0][0].featurizer.topology
        elif isinstance(reader, FeatureReader):
            top = reader.featurizer.topology
        else:
            raise ValueError("unsupported reader (only md readers).")

    stride = int(stride)
    frames = np.array(frames)

    # only one file, so we expect frames to be a one dimensional array
    if isinstance(files, str):
        files = [files]

    if frames.ndim == 1:
        # insert a constant column for file index
        frames = np.insert(np.atleast_2d(frames), 0, np.zeros_like(frames), axis=0).T

    if stride != 1:
        frames[:, 1] *= stride
        if verbose:
            log.info('A stride value of = %u was parsed, '
                     'interpreting "indexes" accordingly.' % stride)

    # sort by file and frame index
    sort_inds = np.lexsort((frames[:, 1], frames[:, 0]))
    sorted_inds = frames[sort_inds]
    assert len(sorted_inds) == len(frames)

    file_inds_unique = np.unique(sorted_inds[:, 0])

    # construct reader
    if reader is None:
        # filter out files, we would never read, because no indices are pointing to them
        reader = source(np.array(files)[file_inds_unique].tolist(), top=top)
        # re-map indices to reflect filtered files:
        for itraj, c in zip(file_inds_unique, itertools.count(0)):
            mask = sorted_inds[:, 0] == itraj
            sorted_inds[mask, 0] = c
        inds_to_check = np.arange(len(file_inds_unique))
    else:
        inds_to_check = file_inds_unique

    # sanity check of indices
    for itraj in inds_to_check:
        inds_by_traj = sorted_inds[sorted_inds[:, 0] == itraj][:, 1]
        assert inds_by_traj.ndim == 1
        largest_ind_in_traj = np.max(inds_by_traj)
        length = reader.trajectory_length(itraj)
        if largest_ind_in_traj >= length:
            # floor division keeps the reported un-strided index an integer
            raise ValueError(
                "largest specified index ({largest_without_stride} * stride="
                "{largest_without_stride} * {stride}={largest}) "
                "is larger than trajectory length '{filename}' = {length}".format(
                    largest_without_stride=largest_ind_in_traj // stride,
                    stride=stride,
                    largest=largest_ind_in_traj,
                    filename=reader.filenames[itraj],
                    length=length))

    def set_reader_return_traj_objects(rdr, flag):
        # toggle whether the underlying FeatureReader(s) yield mdtraj.Trajectory objects
        if isinstance(rdr, FeatureReader):
            rdr._return_traj_obj = flag
        elif isinstance(rdr, FragmentedTrajectoryReader):
            for fname in rdr.filenames_flat:
                r = rdr.reader_by_filename(fname)
                if isinstance(r, FeatureReader):
                    r = [r]
                for _r in r:
                    _r._return_traj_obj = flag

    try:
        # If the reader got passed in, it could have the data already mapped to memory.
        # In this case, we cannot force it to return trajectory objects, so we have to re-create it.
        if reader.in_memory:
            reader = source(reader.filenames, top=top, chunksize=chunksize)

        # we want the FeatureReader to return mdtraj.Trajectory objects
        set_reader_return_traj_objects(reader, True)

        it = reader.iterator(chunk=chunksize, stride=sorted_inds, return_trajindex=False)
        with it:
            collected_frames = list(it)

        dest = _preallocate_empty_trajectory(top, len(frames))
        t = 0
        for chunk in collected_frames:
            _copy_traj_attributes(dest, chunk, t)
            t += len(chunk)

        # reshuffle the indices of the final trajectory object to obtain the desired order
        dest = dest.slice(sort_inds.argsort(), copy=False)
    finally:
        # in any case we want to reset the reader to its previous state
        # (return features, instead of md.Trajectory)
        if reader_given:
            set_reader_return_traj_objects(reader, False)

    return dest
def frames_from_file(file_name, top, frames, chunksize=100,
                     stride=1, verbose=False, copy_not_join=False):
    r"""Reads one "file_name" molecular trajectory and returns an mdtraj trajectory object
    containing only the specified "frames" in the specified order.

    The file is streamed chunk by chunk with :py:func:`mdtraj.iterload`; the
    requested frames of every chunk are collected and finally reordered to
    match the order given in "frames".

    Parameters
    ----------
    file_name: str.
        Absolute path to the molecular trajectory file, ex. trajout.xtc

    top : str, mdtraj.Trajectory, or mdtraj.Topology
        Topology information to load the molecular trajectory file in :py:obj:`file_name`

    frames : ndarray of shape (n_frames, ) and integer type
        Contains the frame indices to be retrieved from "file_name". There is no restriction as to what
        this array has to look like other than:

        - non-negative integers
        - < the total number of frames in "file_name".

        "frames" need not be monotonous or unique, i.e, arrays like
        [3, 1, 4, 1, 5, 9, 9, 9, 9, 3000, 0, 0, 1] are welcome

    chunksize : integer, default is 100
        Number of frames requested from :py:func:`mdtraj.iterload` per chunk.

    stride : integer, default is 1
        Passed on to :py:func:`mdtraj.iterload`. The indices in "frames" are
        counted on the frames delivered by the loader, i.e. on the strided
        trajectory.

    verbose: boolean.
        Level of verbosity while looking for "frames". Useful when using "chunksize" with large trajectories.
        It provides the no. of frames accumulated for every chunk.

    copy_not_join : boolean, default is False
        This parameter decides how geometry objects are appended onto one another. If left to False, mdtraj's own
        :py:obj:`join` method will be used, which is the recommended method. However, for some combinations of
        py:obj:`chunksizes` and :py:obj:`frames` this might be not very effective. If one sets :py:obj:`copy_not_join`
        to True, the returned :py:obj:`traj` is preallocated and the important attributes (currently traj.xyz,
        traj.time, traj.unit_lengths, traj.unit_angles) are broadcasted onto it.

    Returns
    -------
    traj : an md trajectory object containing the frames specified in "frames",
        in the order specified in "frames".

    Raises
    ------
    AssertionError
        If one of the arguments has an unexpected type or dimension.
    ValueError
        If "frames" is empty or contains negative indices.
    Exception
        If "frames" contains indices beyond the end of the trajectory.
    """

    assert isinstance(frames, np.ndarray), \
        "input frames frames must be a numpy ndarray, got %s instead " % type(frames)
    assert np.ndim(frames) == 1, \
        "input frames frames must have ndim = 1, got np.ndim = %u instead " % np.ndim(frames)
    assert isinstance(file_name, str), \
        "input file_name must be a string, got %s instead" % type(file_name)
    assert isinstance(top, (str, md.Trajectory, md.Topology)), \
        "input topology must of one of type: " \
        "str, mdtraj.Trajectory, or mdtraj.Topology. " \
        "Got %s instead" % type(top)

    # Fail early and clearly instead of with an obscure error further down.
    if frames.size == 0:
        raise ValueError("input frames must contain at least one frame index")
    if (frames < 0).any():
        raise ValueError("input frames must be non-negative, got %s" % frames[frames < 0])

    # Enforce topology to be a md.Topology object
    top = _enforce_top(top)

    # Prepare the trajectory object
    if copy_not_join:
        traj = _preallocate_empty_trajectory(top, frames.shape[0])
    else:
        traj = None

    # Prepare the running number of accumulated frames
    cum_frames = 0

    # Because the trajectory is streamed "chronologically", but "frames" can have any arbitrary order
    # we store that order in "orig_order" to reshuffle the traj at the end
    orig_order = frames.argsort().argsort()
    sorted_frames = np.sort(frames)
    largest_frame = frames.max()

    # Index of the last frame seen so far; stays -1 if the loader yields nothing.
    last_index = -1

    for jj, traj_chunk in enumerate(md.iterload(file_name, top=top,
                                                chunk=chunksize, stride=stride)):
        # Global index window [i_idx, i_idx + n_indexed) covered by this chunk;
        # the window never spans more than chunksize indices.
        i_idx = jj * chunksize
        n_indexed = min(traj_chunk.n_frames, chunksize)
        last_index = i_idx + n_indexed - 1

        # "sorted_frames" is sorted, so the requested frames of this chunk form
        # one contiguous slice. Frames that appear more than one time will be kept.
        lo, hi = np.searchsorted(sorted_frames, [i_idx, i_idx + n_indexed], side='left')
        offsets = sorted_frames[lo:hi] - i_idx
        good_frames = offsets.astype(np.intp)
        # only values that are exactly equal to a frame index count as a match
        good_frames = good_frames[good_frames == offsets]

        # Keep the good frames of this chunk
        if good_frames.size > 0:
            if copy_not_join:  # => traj has been already preallocated, see above
                traj = _copy_traj_attributes(traj, traj_chunk[good_frames], cum_frames)
            elif traj is None:  # => copy_not_join is False AND 1st run
                traj = traj_chunk[good_frames]
            else:  # => copy_not_join is False AND we're not on the 1st run
                traj = traj.join(traj_chunk[good_frames])

            cum_frames += good_frames.size

        if verbose:
            log.info('chunk %u of traj has size %u, indices %6u...%6u. Accumulated frames %u'
                     % (jj, traj_chunk.n_frames, i_idx, last_index, cum_frames))

        # Check if we can already stop iterating
        if last_index >= largest_frame:
            break

    # Make sure that "frames" did not contain impossible frames
    beyond_end = frames > last_index
    if beyond_end.any():
        raise Exception('Cannot provide frames %s for trajectory %s with n_frames = %u'
                        % (frames[beyond_end], file_name, last_index + 1))

    if stride != 1 and verbose:
        log.info('A stride value of = %u was parsed, interpreting "indexes" accordingly.' % stride)

    # Trajectory coordinates are returned "reshuffled"
    return traj[orig_order]