def write_trajectory(self, clone_dir, output_dir, trajectory_number, stride,
                     max_rmsd, min_gens, center_conformations, memory_check,
                     omp_parallel_rmsd=True):
    """Merge all the XTC files found in a CLONE directory into one H5 trajectory.

    Parameters
    ----------
    clone_dir : str
        the directory in which the xtc files are found. All of the xtc
        files in this directory are joined together to make a single
        trajectory (.h5) output file
    output_dir : str
        directory where the outputted files will be placed
    trajectory_number : int
        A unique number for this trajectory. This number is used in
        constructing the filename to write the outputted .h5 trajectory
        to, and thus must be unique
    stride : int
        Subsample by only considering every Nth snapshot.
    max_rmsd : {int, None}
        if this value is not None, calculate the RMSD to the pdb_file
        from each snapshot and reject trajectories which have snapshots
        with RMSD greater than max_rmsd. If None, no check is performed
    min_gens : int
        Discard the trajectories that contain fewer than `min_gens` XTC
        files.
    center_conformations : bool
        center conformations before saving.
    memory_check : bool
        if yes, uses the memory dictionary to do an update rather than
        a complete re-convert.
    omp_parallel_rmsd : bool
        If true, use OpenMP accelerated RMSD calculation for the
        max_rmsd check
    """
    xtc_files = self.list_xtcs_in_dir(clone_dir)

    # Ensure that we're only joining contiguously numbered xtc files --
    # starting at 0 -- into a trajectory. If there are gaps in the xtc
    # files in the directory, we only want to use the ones that are
    # contiguously numbered.
    i = 0
    for i, filename in enumerate(xtc_files):
        if self.integer_component(filename) != i:
            logger.error("Found discontinuity in xtc numbering - check data in %s",
                         clone_dir)
            xtc_files = xtc_files[0:i]
            break

    # Check the memory object to see which xtc files have already been
    # converted, and exclude those from this conversion.
    if memory_check:
        if clone_dir in self.memory:  # fixed: '.keys()' was redundant here
            previous_convert_exists = True
            num_xtcs_converted = self.memory[clone_dir][1]
            if len(xtc_files) == num_xtcs_converted:
                # we have already converted everything -- nothing to do
                logger.info("Already converted all files in %s, skipping...",
                            clone_dir)
                return
            else:
                xtc_files = xtc_files[num_xtcs_converted:]
        else:
            previous_convert_exists = False
    else:
        previous_convert_exists = False

    xtc_file_paths = [os.path.join(clone_dir, f) for f in xtc_files]
    logger.info("Processing %d xtc files in clone_dir = %s",
                len(xtc_files), clone_dir)

    # NOTE(review): the docstring says "fewer than min_gens", but '<='
    # also discards trajectories with exactly min_gens files -- confirm
    # which is intended before changing either.
    if len(xtc_files) <= min_gens:
        logger.info("Skipping trajectory in clone_dir = %s", clone_dir)
        logger.info("Too few xtc files (generations).")
        return

    try:
        # [this should check for and discard overlapping snapshots]
        trajectory = Trajectory.load_from_xtc(xtc_file_paths,
                                              PDBFilename=self.pdb_topology,
                                              discard_overlapping_frames=True)
    except IOError as e:
        logger.error("IOError (%s) when processing trajectory in clone_dir = %s",
                     e, clone_dir)
        logger.error("Attempting rescue by disregarding final frame, which is often")
        logger.error("the first/only frame to be corrupted")

        if len(xtc_file_paths) == 1:
            logger.error("Didn't find any other frames in %s, continuing...",
                         clone_dir)
            return

        try:
            # retry without the (presumed corrupt) final xtc file
            trajectory = Trajectory.load_from_xtc(xtc_file_paths[0:-1],
                                                  PDBFilename=self.pdb_topology)
        except IOError:
            logger.error("Unfortunately, the error remained even after ignoring the final frame.")
            logger.error("Skipping the trajectory in clone_dir = %s", clone_dir)
            return
        else:
            # fixed typo in log message: "Sucessfully" -> "Successfully"
            logger.error("Successfully recovered from IOError by disregarding final frame.")

    if max_rmsd is not None:
        # AtomID values are 1-based; convert to 0-based indices.
        # fixed: comprehension variable renamed so it no longer shadows
        # the enumerate() index 'i' above.
        atomindices = [int(a) - 1 for a in trajectory['AtomID']]
        rmsdmetric = RMSD(atomindices, omp_parallel=omp_parallel_rmsd)
        ppdb = rmsdmetric.prepare_trajectory(
            Trajectory.load_trajectory_file(self.pdb_topology))
        ptraj = rmsdmetric.prepare_trajectory(trajectory)
        rmsds = rmsdmetric.one_to_all(ppdb, ptraj, 0)
        worst_rmsd = max(rmsds)  # hoisted: was computed three times
        if worst_rmsd > max_rmsd:
            logger.warning("Snapshot %d RMSD %f > the %f cutoff",
                           argmax(rmsds), worst_rmsd, max_rmsd)
            logger.warning("Dropping trajectory")
            return

    if center_conformations:
        RMSD.TheoData.centerConformations(trajectory["XYZList"])

    # If we are adding to a previous trajectory, we have to load that
    # traj up and extend it.
    if previous_convert_exists:
        output_filename = self.memory[clone_dir][0]
        output_file_path = output_filename
        logger.info("Extending: %s", output_filename)
        assert os.path.exists(output_filename)

        # load the traj and extend it
        # [this should check for and discard overlapping snapshots]
        Trajectory.append_frames_to_file(output_filename,
                                         trajectory['XYZList'][::stride],
                                         discard_overlapping_frames=True)
        num_xtcs_processed = len(xtc_file_paths) + self.memory[clone_dir][1]

    # if we are not adding to a traj, then we create a new one
    else:
        output_filename = 'trj%s.h5' % trajectory_number
        output_file_path = os.path.join(output_dir, output_filename)
        if os.path.exists(output_file_path):
            logger.info("The file name %s already exists. Skipping it.",
                        output_file_path)
            return

        # stride and discard by snapshot (fixed comment typo: "stide")
        trajectory['XYZList'] = trajectory['XYZList'][::stride]
        trajectory.save(output_file_path)
        num_xtcs_processed = len(xtc_file_paths)

    # log what we did into the memory object
    self.memory[clone_dir] = [output_file_path, num_xtcs_processed]
    return
def write_trajectory(self, clone_dir, output_dir, trajectory_number, stride,
                     max_rmsd, min_gens, center_conformations, memory_check,
                     omp_parallel_rmsd=True):
    """Merge all the XTC files found in a CLONE directory into one LH5 trajectory.

    Parameters
    ----------
    clone_dir : str
        the directory in which the xtc files are found. All of the xtc
        files in this directory are joined together to make a single
        trajectory (.lh5) output file
    output_dir : str
        directory where the outputted files will be placed
    trajectory_number : int
        A unique number for this trajectory. This number is used in
        constructing the filename to write the outputted .lh5 trajectory
        to, and thus must be unique
    stride : int
        Subsample by only considering every Nth snapshot.
    max_rmsd : {int, None}
        if this value is not None, calculate the RMSD to the pdb_file
        from each snapshot and reject trajectories which have snapshots
        with RMSD greater than max_rmsd. If None, no check is performed
    min_gens : int
        Discard the trajectories that contain fewer than `min_gens` XTC
        files.
    center_conformations : bool
        center conformations before saving.
    memory_check : bool
        if yes, uses the memory dictionary to do an update rather than
        a complete re-convert.
    omp_parallel_rmsd : bool
        If true, use OpenMP accelerated RMSD calculation for the
        max_rmsd check
    """
    xtc_files = self.list_xtcs_in_dir(clone_dir)

    # Ensure that we're only joining contiguously numbered xtc files --
    # starting at 0 -- into a trajectory. If there are gaps in the xtc
    # files in the directory, we only want to use the ones that are
    # contiguously numbered.
    i = 0
    for i, filename in enumerate(xtc_files):
        if self.integer_component(filename) != i:
            logger.error("Found discontinuity in xtc numbering - check data in %s",
                         clone_dir)
            xtc_files = xtc_files[0:i]
            break

    # Check the memory object to see which xtc files have already been
    # converted, and exclude those from this conversion.
    if memory_check:
        if clone_dir in self.memory:  # fixed: '.keys()' was redundant here
            previous_convert_exists = True
            num_xtcs_converted = self.memory[clone_dir][1]
            if len(xtc_files) == num_xtcs_converted:
                # we have already converted everything -- nothing to do
                logger.info("Already converted all files in %s, skipping...",
                            clone_dir)
                return
            else:
                xtc_files = xtc_files[num_xtcs_converted:]
        else:
            previous_convert_exists = False
    else:
        previous_convert_exists = False

    xtc_file_paths = [os.path.join(clone_dir, f) for f in xtc_files]
    logger.info("Processing %d xtc files in clone_dir = %s",
                len(xtc_files), clone_dir)

    # NOTE(review): the docstring says "fewer than min_gens", but '<='
    # also discards trajectories with exactly min_gens files -- confirm
    # which is intended before changing either.
    if len(xtc_files) <= min_gens:
        logger.info("Skipping trajectory in clone_dir = %s", clone_dir)
        logger.info("Too few xtc files (generations).")
        return

    try:
        # [this should check for and discard overlapping snapshots]
        trajectory = Trajectory.load_from_xtc(xtc_file_paths,
                                              PDBFilename=self.pdb_topology,
                                              discard_overlapping_frames=True)
    except IOError as e:
        logger.error("IOError (%s) when processing trajectory in clone_dir = %s",
                     e, clone_dir)
        logger.error("Attempting rescue by disregarding final frame, which is often")
        logger.error("the first/only frame to be corrupted")

        if len(xtc_file_paths) == 1:
            logger.error("Didn't find any other frames in %s, continuing...",
                         clone_dir)
            return

        try:
            # retry without the (presumed corrupt) final xtc file
            trajectory = Trajectory.load_from_xtc(xtc_file_paths[0:-1],
                                                  PDBFilename=self.pdb_topology)
        except IOError:
            logger.error("Unfortunately, the error remained even after ignoring the final frame.")
            logger.error("Skipping the trajectory in clone_dir = %s", clone_dir)
            return
        else:
            # fixed typo in log message: "Sucessfully" -> "Successfully"
            logger.error("Successfully recovered from IOError by disregarding final frame.")

    if max_rmsd is not None:
        # AtomID values are 1-based; convert to 0-based indices.
        # fixed: comprehension variable renamed so it no longer shadows
        # the enumerate() index 'i' above.
        atomindices = [int(a) - 1 for a in trajectory['AtomID']]
        rmsdmetric = RMSD(atomindices, omp_parallel=omp_parallel_rmsd)
        ppdb = rmsdmetric.prepare_trajectory(
            Trajectory.load_trajectory_file(self.pdb_topology))
        ptraj = rmsdmetric.prepare_trajectory(trajectory)
        rmsds = rmsdmetric.one_to_all(ppdb, ptraj, 0)
        worst_rmsd = max(rmsds)  # hoisted: was computed three times
        if worst_rmsd > max_rmsd:
            logger.warning("Snapshot %d RMSD %f > the %f cutoff",
                           argmax(rmsds), worst_rmsd, max_rmsd)
            logger.warning("Dropping trajectory")
            return

    if center_conformations:
        RMSD.TheoData.centerConformations(trajectory["XYZList"])

    # If we are adding to a previous trajectory, we have to load that
    # traj up and extend it.
    if previous_convert_exists:
        output_filename = self.memory[clone_dir][0]
        output_file_path = output_filename
        logger.info("Extending: %s", output_filename)
        assert os.path.exists(output_filename)

        # load the traj and extend it
        # [this should check for and discard overlapping snapshots]
        Trajectory.append_frames_to_file(output_filename,
                                         trajectory['XYZList'][::stride],
                                         discard_overlapping_frames=True)
        num_xtcs_processed = len(xtc_file_paths) + self.memory[clone_dir][1]

    # if we are not adding to a traj, then we create a new one
    else:
        output_filename = 'trj%s.lh5' % trajectory_number
        output_file_path = os.path.join(output_dir, output_filename)
        if os.path.exists(output_file_path):
            logger.info("The file name %s already exists. Skipping it.",
                        output_file_path)
            return

        # stride and discard by snapshot (fixed comment typo: "stide")
        trajectory['XYZList'] = trajectory['XYZList'][::stride]
        trajectory.save(output_file_path)
        num_xtcs_processed = len(xtc_file_paths)

    # log what we did into the memory object
    self.memory[clone_dir] = [output_file_path, num_xtcs_processed]
    return