def run(project, atom_indices=None, traj_fn='all'):
    """Compute solvent accessible surface areas for trajectories.

    Parameters
    ----------
    project : msmbuilder.Project
        Used to locate the trajectories (``n_trajs``, ``traj_lengths``,
        ``traj_filename``) and, when no ``atom_indices`` are given, to read
        the number of atoms from ``load_conf()``.
    atom_indices : {None, sequence of int}
        Forwarded as ``AtomIndices`` when loading trajectory chunks.
    traj_fn : str
        ``'all'`` (case-insensitive) to process every trajectory in the
        project, otherwise the path of a single trajectory to process.

    Returns
    -------
    SASA : np.ndarray
        For ``'all'``: an array of shape
        ``(n_trajs, max(traj_lengths), n_atoms)`` initialised to -1, so
        frames beyond the end of a shorter trajectory stay at -1.
        Otherwise: ``np.array`` of the values collected for the single
        trajectory.
    """
    if traj_fn.lower() != 'all':
        # Single-trajectory mode: no padding array (and no conformation
        # load) is needed.
        # NOTE(review): this branch uses calculate_asa's default
        # n_sphere_points while the 'all' branch passes 24 -- confirm that
        # the difference is intentional.
        traj_asa = []
        for traj_chunk in Trajectory.enum_chunks_from_lhdf(
                traj_fn, AtomIndices=atom_indices):
            traj_asa.extend(asa.calculate_asa(traj_chunk))
        return np.array(traj_asa)

    # The last axis must match the atoms that are actually loaded.  Using
    # the full conformation's atom count when a subset is requested makes
    # the slice assignment below fail with a shape mismatch.
    # (assumes calculate_asa yields one value per loaded atom per frame)
    if atom_indices is None:
        n_atoms = project.load_conf()['XYZList'].shape[1]
    else:
        n_atoms = len(atom_indices)

    SASA = np.ones(
        (project.n_trajs, np.max(project.traj_lengths), n_atoms)) * -1

    for traj_ind in xrange(project.n_trajs):
        logger.info("Working on Trajectory %d", traj_ind)
        filename = project.traj_filename(traj_ind)

        traj_asa = []
        for traj_chunk in Trajectory.enum_chunks_from_lhdf(
                filename, AtomIndices=atom_indices):
            traj_asa.extend(
                asa.calculate_asa(traj_chunk, n_sphere_points=24))

        SASA[traj_ind, 0:project.traj_lengths[traj_ind]] = traj_asa

    return SASA
def run(project, atom_indices=None, traj_fn='all'):
    """Calculate the solvent accessible surface area of trajectory frames.

    Parameters
    ----------
    project : msmbuilder.Project
        Source of the trajectory file names and lengths.  Its conformation
        is only loaded to count atoms when ``atom_indices`` is None.
    atom_indices : {None, sequence of int}
        Passed as ``AtomIndices`` to the chunked trajectory loader.
    traj_fn : str
        ``'all'`` (any case) processes the whole project; any other value
        is treated as the file name of one trajectory.

    Returns
    -------
    SASA : np.ndarray
        Whole-project mode returns a
        ``(n_trajs, max(traj_lengths), n_atoms)`` array in which entries
        that were never filled (frames past the end of a trajectory) are
        -1.  Single-trajectory mode returns ``np.array`` of the collected
        per-frame results.
    """
    whole_project = traj_fn.lower() == 'all'

    if whole_project:
        # Size the atom axis from what will really be loaded: a subset of
        # atoms yields fewer columns than the full conformation, and the
        # row assignment further down would otherwise raise on the shape
        # mismatch.
        # (assumes calculate_asa returns one value per loaded atom)
        n_atoms = (len(atom_indices) if atom_indices is not None
                   else project.load_conf()['XYZList'].shape[1])

        SASA = np.ones(
            (project.n_trajs, np.max(project.traj_lengths), n_atoms)) * -1

        for traj_ind in xrange(project.n_trajs):
            logger.info("Working on Trajectory %d", traj_ind)

            chunks = Trajectory.enum_chunks_from_lhdf(
                project.traj_filename(traj_ind), AtomIndices=atom_indices)

            per_frame = []
            for traj_chunk in chunks:
                per_frame.extend(
                    asa.calculate_asa(traj_chunk, n_sphere_points=24))

            SASA[traj_ind, 0:project.traj_lengths[traj_ind]] = per_frame
    else:
        # NOTE(review): n_sphere_points is left at calculate_asa's default
        # here but is 24 in the whole-project branch -- confirm intent.
        per_frame = []
        for traj_chunk in Trajectory.enum_chunks_from_lhdf(
                traj_fn, AtomIndices=atom_indices):
            per_frame.extend(asa.calculate_asa(traj_chunk))

        SASA = np.array(per_frame)

    return SASA
def _dssp_for_trajectory(pool, filename, chunk_size, label, n_frames):
    """Run ``analyze_conf`` over every frame of one trajectory file.

    The trajectory is read in chunks of ``chunk_size`` frames; each chunk's
    ``'XYZList'`` entries are mapped over ``pool``.  Progress is printed
    after every chunk using ``label`` and ``n_frames``.  Returns the list of
    per-frame results in frame order.
    """
    assignments = []
    n_done = 0
    for trj_chunk in Trajectory.enum_chunks_from_lhdf(filename,
                                                      ChunkSize=chunk_size):
        # get() blocks until the whole chunk has been processed
        result = pool.map_async(analyze_conf, trj_chunk['XYZList'])
        assignments.extend(result.get())

        n_done += len(trj_chunk)
        # single parenthesised argument: prints the same text under the
        # print statement and the print function
        print("Trajectory %s: %d / %d" % (label, n_done, n_frames))
    return assignments


def run(project, output, num_procs=1, chunk_size=50000, traj_fn='all'):
    """Compute per-frame DSSP assignments and save them with ``np.save``.

    Parameters
    ----------
    project : msmbuilder.Project
        Used for ``n_trajs``, ``traj_lengths`` and ``traj_filename`` when
        ``traj_fn`` is ``'all'``.
    output : str
        Destination handed to ``np.save``.
    num_procs : int
        Number of worker processes in the multiprocessing pool.
    chunk_size : int
        Number of frames loaded and dispatched to the pool at a time.
    traj_fn : str
        ``'all'`` (case-insensitive) to process every trajectory in the
        project, otherwise the path of a single trajectory.

    Notes
    -----
    The saved array holds one entry per processed trajectory (a single
    entry in single-trajectory mode).  The module-level ``DEVNULL`` handle
    is closed once the output has been saved.
    """
    pool = mp.Pool(num_procs)
    dssp_assignments = []

    try:
        if traj_fn.lower() == 'all':
            for i in xrange(project.n_trajs):
                dssp_assignments.append(
                    _dssp_for_trajectory(pool, project.traj_filename(i),
                                         chunk_size, i,
                                         project.traj_lengths[i]))
        else:
            n_frames = Trajectory.load_from_lhdf(traj_fn,
                                                 JustInspect=True)[0]
            dssp_assignments.append(
                _dssp_for_trajectory(pool, traj_fn, chunk_size, traj_fn,
                                     n_frames))
    finally:
        # Every dispatched chunk has been collected with get() by the time
        # we get here on success, so no outstanding work is lost.  On
        # failure this keeps the worker processes from lingering.
        pool.terminate()
        pool.join()

    dssp_assignments = np.array(dssp_assignments)
    np.save(output, dssp_assignments)
    DEVNULL.close()
def run(prep_metric, project, delta_time, atom_indices=None,
        output='tICAData.h5', min_length=0, stride=1):
    """Train a tICA object on the project's trajectories and save it.

    Parameters
    ----------
    prep_metric : object
        Forwarded to ``tICA`` as ``prep_metric``.
    project : msmbuilder.Project
        Supplies ``n_trajs``, ``traj_lengths`` and ``traj_filename``.
    delta_time : int
        Lag expressed in unstrided frames; must be a multiple of ``stride``.
    atom_indices : {None, sequence of int}
        Passed as ``AtomIndices`` when loading trajectory chunks.
    output : str
        Where the trained object is written via ``tica_obj.save``.
    min_length : int
        Trajectories with fewer frames than this are skipped.
    stride : int
        Passed as ``Stride`` when loading trajectory chunks.

    Returns
    -------
    tica_obj : tICA
        The trained and solved object, after it has been saved.

    Raises
    ------
    Exception
        If ``stride`` does not divide ``delta_time``.
    """
    # Frames are read every `stride` steps, so the lag handed to tICA is
    # measured in strided frames.
    lag = delta_time / stride
    if lag != (float(delta_time) / stride):
        raise Exception("Stride must be a divisor of delta_time.")

    # A positive lag means genuine tICA; otherwise (lag of zero) this is
    # equivalent to regular PCA and calc_cov_mat is switched off.
    use_cov_mat = bool(lag > 0)
    tica_obj = tICA(lag=lag, calc_cov_mat=use_cov_mat,
                    prep_metric=prep_metric)

    for traj_ind in xrange(project.n_trajs):
        logger.info("Working on trajectory %d" % traj_ind)
        n_frames = project.traj_lengths[traj_ind]

        # NOTE(review): n_frames looks like the unstrided length while lag
        # is in strided frames -- confirm this comparison is intended.
        if n_frames <= lag:
            logger.info("Trajectory is not long enough for this lag "
                        "(%d vs %d)", n_frames, lag)
            continue

        if n_frames < min_length:
            logger.info("Trajectory is not longer than min_length "
                        "(%d vs %d)", n_frames, min_length)
            continue

        chunk_iter = Trajectory.enum_chunks_from_lhdf(
            project.traj_filename(traj_ind), Stride=stride,
            AtomIndices=atom_indices)
        for traj_chunk in chunk_iter:
            tica_obj.train(trajectory=traj_chunk)

    tica_obj.solve()
    tica_obj.save(output)
    logger.info("Saved output to %s", output)

    return tica_obj
def load_prep_trajectories(project, stride, atom_indices, metric):
    """Load every trajectory in chunks, preparing each chunk as it is read.

    Preparing chunk by chunk is helpful for metrics that use dimensionality
    reduction: only the prepared form is kept, so more frames can be used
    without a MemoryError.

    Parameters
    ----------
    project : msmbuilder.Project
        Supplies ``n_trajs``, ``traj_lengths`` and ``traj_filename``.
    stride : int
        Passed as ``Stride`` to the chunked loader and used to build the
        frame index.
    atom_indices : {None, sequence of int}
        Passed as ``AtomIndices`` to the chunked loader.
    metric : object
        Its ``prepare_trajectory`` is applied to each loaded chunk.

    Returns
    -------
    list_of_ptrajs : list
        One concatenated prepared trajectory per project trajectory.
    which : np.ndarray
        ``(trajectory index, frame index)`` pairs built from
        ``np.arange(0, traj_length, stride)`` for every trajectory.
    """
    prepared_trajs = []
    frame_lookup = []

    for traj_ind in xrange(project.n_trajs):
        strided_frames = np.arange(0, project.traj_lengths[traj_ind], stride)
        frame_lookup.extend((traj_ind, frame) for frame in strided_frames)

        chunk_iter = Trajectory.enum_chunks_from_lhdf(
            project.traj_filename(traj_ind), Stride=stride,
            AtomIndices=atom_indices)
        prepared_chunks = [metric.prepare_trajectory(chunk)
                           for chunk in chunk_iter]

        prepared_trajs.append(np.concatenate(prepared_chunks))

    return prepared_trajs, np.array(frame_lookup)
else: AvgCMs_1d = None if Ass.max() < 250: pp = PdfPages(args.out_plot) else: pp = None chunk_size = 10000 if AvgCMs_1d == None: for traj_ind in xrange(Ass.shape[0]): logger.info("Working on %s" % Proj.traj_filename(traj_ind)) for chunk_ind, trj_chunk in enumerate( Trajectory.enum_chunks_from_lhdf(Proj.traj_filename(traj_ind), ChunkSize=chunk_size) ): logger.debug("chunked") ptrj_chunk = get_hb(trj_chunk).astype(float) ass_chunk = Ass[traj_ind][ chunk_ind * chunk_size : (chunk_ind + 1) * chunk_size ] # this behaves as you want at the end of the array for i, ass in enumerate(ass_chunk): if ass == -1: continue CMs_1d[ass] += ptrj_chunk[i] # StateAssigns = np.array([ np.where( Ass == i )[0].shape[0] for i in np.unique( Ass[ np.where( Ass >= 0 ) ] )] ) StateAssigns = np.bincount(Ass[np.where(Ass != -1)], minlength=Ass.max() + 1) StateAssigns = StateAssigns.reshape((len(StateAssigns), 1))
def assign_with_checkpoint(metric, project, generators, assignments_path,
                           distances_path, chunk_size=10000):
    """
    Assign every frame to its closest generator

    Parameters
    ----------
    metric : msmbuilder.metrics.AbstractDistanceMetric
        A distance metric used to define "closest"
    project : msmbuilder.Project
        Used to load the trajectories
    generators : msmbuilder.Trajectory
        A trajectory containing the structures of all of the cluster centers
    assignments_path : str
        Path to a file that contains/will contain the assignments, as a 2D
        array of integers in hdf5 format
    distances_path : str
        Path to a file that contains/will contain the distance from each
        frame to its assigned generator, as a 2D array of floats in hdf5
        format
    chunk_size : int
        The number of frames to load and process per step. The optimal
        number here depends on your system memory -- it should probably be
        roughly the number of frames you can fit in memory at any one time.
        Note, this is only important if your trajectories are long, as the
        effective chunk_size is really `min(traj_length, chunk_size)`

    Raises
    ------
    RuntimeError
        If a trajectory is marked as completed in only one of the two files.

    Notes
    -----
    The results will be checkpointed along the way, trajectory by trajectory.
    So if the process is killed, it should be able to roughly pick up where
    it left off.
    """
    pgens = metric.prepare_trajectory(generators)

    # setup the file handles
    fh_a, fh_d = _setup_containers(project, assignments_path, distances_path)

    # try/finally so both files are closed even if assignment fails part way
    # (including the corruption error below)
    try:
        for i in xrange(project.n_trajs):
            if fh_a.root.completed_trajs[i] and fh_d.root.completed_trajs[i]:
                logger.info('Skipping trajectory %s -- already assigned', i)
                continue
            if fh_a.root.completed_trajs[i] or fh_d.root.completed_trajs[i]:
                raise RuntimeError('Corruption detected')
            logger.info('Assigning trajectory %s', i)

            # pointer to the position in the total trajectory where
            # the current chunk starts, so we know where in the Assignments
            # array to put each batch of data
            start_index = 0

            for tchunk in Trajectory.enum_chunks_from_lhdf(
                    project.traj_filename(i), ChunkSize=chunk_size):
                ptchunk = metric.prepare_trajectory(tchunk)
                this_length = len(ptchunk)

                distances = np.empty(this_length, dtype=np.float32)
                # np.int was only an alias of the builtin int and has been
                # removed from NumPy; the builtin gives the same dtype
                assignments = np.empty(this_length, dtype=int)

                for j in xrange(this_length):
                    d = metric.one_to_all(ptchunk, pgens, j)
                    ind = np.argmin(d)
                    assignments[j] = ind
                    distances[j] = d[ind]

                end_index = start_index + this_length
                fh_a.root.arr_0[i, start_index:end_index] = assignments
                fh_d.root.arr_0[i, start_index:end_index] = distances

                # i'm not sure exactly what the optimal flush frequency is
                fh_a.flush()
                fh_d.flush()
                start_index = end_index

            # we're going to keep duplicates of this record -- i.e. writing
            # it to both files

            # completed chunks are not checkpointed -- only completed
            # trajectories. this means that if the process dies after
            # completing 10/20 of the chunks in trajectory i -- those chunks
            # are going to have to be recomputed (but trajectory i-1 is saved)

            # this could be changed, but the implementation is a little
            # tricky -- you have to watch out for the fact that the person
            # might call this function with chunk_size=N, let it run for a
            # while, kill it, and then call it again with chunk_size != N.
            # Dealing with that appropriately is tricky since the chunks
            # wont line up in the two cases
            fh_a.root.completed_trajs[i] = True
            fh_d.root.completed_trajs[i] = True
    finally:
        try:
            fh_a.close()
        finally:
            fh_d.close()
CMs = None if Ass.max() < 250: pp = PdfPages(args.out_plot) else: pp = None chunk_size = 10000 if CMs == None: for traj_ind in xrange(Ass.shape[0]): logger.info("Working on %s" % Proj.traj_filename(traj_ind)) for chunk_ind, trj_chunk in enumerate( Trajectory.enum_chunks_from_lhdf( Proj.traj_filename(traj_ind), ChunkSize=chunk_size, AtomIndices=atom_indices ) ): logger.debug("chunked") ptrj_chunk = HB.prepare_trajectory(trj_chunk).astype(float) ass_chunk = Ass[traj_ind][ chunk_ind * chunk_size : (chunk_ind + 1) * chunk_size ] # this behaves as you want at the end of the array for i, ass in enumerate(ass_chunk): if ass == -1: continue CMs_1d[ass] += ptrj_chunk[i] # StateAssigns = np.array([ np.where( Ass == i )[0].shape[0] for i in np.unique( Ass[ np.where( Ass >= 0 ) ] )] ) StateAssigns = np.bincount(Ass[np.where(Ass != -1)], minlength=Ass.max() + 1)
def assign_with_checkpoint(metric, project, generators, assignments_path,
                           distances_path, chunk_size=10000):
    """
    Assign every frame to its closest generator

    Parameters
    ----------
    metric : msmbuilder.metrics.AbstractDistanceMetric
        A distance metric used to define "closest"
    project : msmbuilder.Project
        Used to load the trajectories
    generators : msmbuilder.Trajectory
        A trajectory containing the structures of all of the cluster centers
    assignments_path : str
        Path to a file that contains/will contain the assignments, as a 2D
        array of integers in hdf5 format
    distances_path : str
        Path to a file that contains/will contain the distances to the
        assigned generators, as a 2D array of floats in hdf5 format
    chunk_size : int
        The number of frames to load and process per step. The optimal
        number here depends on your system memory -- it should probably be
        roughly the number of frames you can fit in memory at any one time.
        Note, this is only important if your trajectories are long, as the
        effective chunk_size is really `min(traj_length, chunk_size)`

    Raises
    ------
    RuntimeError
        If exactly one of the two files marks a trajectory as completed.

    Notes
    -----
    The results will be checkpointed along the way, trajectory by trajectory.
    So if the process is killed, it should be able to roughly pick up where
    it left off.
    """
    pgens = metric.prepare_trajectory(generators)

    # setup the file handles
    fh_a, fh_d = _setup_containers(project, assignments_path, distances_path)

    # Guarantee both handles are released on every exit path; previously an
    # exception (e.g. the corruption check) left them open.
    try:
        for i in xrange(project.n_trajs):
            done_a = fh_a.root.completed_trajs[i]
            done_d = fh_d.root.completed_trajs[i]
            if done_a and done_d:
                logger.info('Skipping trajectory %s -- already assigned', i)
                continue
            if done_a or done_d:
                raise RuntimeError('Corruption detected')
            logger.info('Assigning trajectory %s', i)

            # pointer to the position in the total trajectory where
            # the current chunk starts, so we know where in the Assignments
            # array to put each batch of data
            start_index = 0

            chunkiter = Trajectory.enum_chunks_from_lhdf(
                project.traj_filename(i), ChunkSize=chunk_size)
            for tchunk in chunkiter:
                ptchunk = metric.prepare_trajectory(tchunk)
                this_length = len(ptchunk)

                distances = np.empty(this_length, dtype=np.float32)
                # builtin int replaces the removed np.int alias (same dtype)
                assignments = np.empty(this_length, dtype=int)

                for j in xrange(this_length):
                    d = metric.one_to_all(ptchunk, pgens, j)
                    ind = np.argmin(d)
                    assignments[j] = ind
                    distances[j] = d[ind]

                end_index = start_index + this_length
                fh_a.root.arr_0[i, start_index:end_index] = assignments
                fh_d.root.arr_0[i, start_index:end_index] = distances

                # i'm not sure exactly what the optimal flush frequency is
                fh_a.flush()
                fh_d.flush()
                start_index = end_index

            # we're going to keep duplicates of this record -- i.e. writing
            # it to both files

            # completed chunks are not checkpointed -- only completed
            # trajectories. this means that if the process dies after
            # completing 10/20 of the chunks in trajectory i -- those chunks
            # are going to have to be recomputed (but trajectory i-1 is saved)

            # this could be changed, but the implementation is a little
            # tricky -- you have to watch out for the fact that the person
            # might call this function with chunk_size=N, let it run for a
            # while, kill it, and then call it again with chunk_size != N.
            # Dealing with that appropriately is tricky since the chunks
            # wont line up in the two cases
            fh_a.root.completed_trajs[i] = True
            fh_d.root.completed_trajs[i] = True
    finally:
        try:
            fh_a.close()
        finally:
            fh_d.close()
def assign_with_checkpoint(metric, project, generators, assignments_path,
                           distances_path, chunk_size=10000,
                           atom_indices_to_load=None):
    """
    Assign every frame to its closest generator

    The results will be checkpointed along the way, trajectory by trajectory.
    If the process is killed, it should be able to roughly pick up where it
    left off.

    Parameters
    ----------
    metric : msmbuilder.metrics.AbstractDistanceMetric
        A distance metric used to define "closest"
    project : msmbuilder.Project
        Used to load the trajectories
    generators : msmbuilder.Trajectory
        A trajectory containing the structures of all of the cluster centers
    assignments_path : str
        Path to a file that contains/will contain the assignments, as a 2D
        array of integers in hdf5 format
    distances_path : str
        Path to a file that contains/will contain the distance from each
        frame to its assigned generator, as a 2D array of floats in hdf5
        format
    chunk_size : int
        The number of frames to load and process per step. The optimal
        number here depends on your system memory -- it should probably be
        roughly the number of frames you can fit in memory at any one time.
        Note, this is only important if your trajectories are long, as the
        effective chunk_size is really `min(traj_length, chunk_size)`
    atom_indices_to_load : {None, list}
        The indices of the atoms to load for each trajectory chunk. Note that
        this method is responsible for loading up atoms from the project, but
        does NOT load up the generators. Those are passed in as a trajectory
        object (above). So if the generators are already subsampled to a
        restricted set of atom indices, but the trajectories on disk are NOT,
        you'll need to pass in a set of indices here to resolve the
        difference.

    Raises
    ------
    ValueError
        If a loaded chunk does not have the same number of atoms as the
        generators.

    See Also
    --------
    assign_in_memory
    """
    pgens = metric.prepare_trajectory(generators)

    # setup the file handles
    fh_a, fh_d = _setup_containers(project, assignments_path, distances_path)

    # try/finally so both files are closed even when assignment fails part
    # way (e.g. the atom-count ValueError below)
    try:
        for i in xrange(project.n_trajs):
            if fh_a.root.completed_trajs[i] and fh_d.root.completed_trajs[i]:
                logger.info('Skipping trajectory %s -- already assigned', i)
                continue
            if fh_a.root.completed_trajs[i] or fh_d.root.completed_trajs[i]:
                # only one file claims completion: reset both flags and
                # redo the trajectory.  logger.warn is a deprecated alias
                # of logger.warning
                logger.warning("Re-assigning trajectory even though some data is"
                               " available...")
                fh_a.root.completed_trajs[i] = False
                fh_d.root.completed_trajs[i] = False
            logger.info('Assigning trajectory %s', i)

            # pointer to the position in the total trajectory where
            # the current chunk starts, so we know where in the Assignments
            # array to put each batch of data
            start_index = 0

            filename = project.traj_filename(i)
            chunkiter = Trajectory.enum_chunks_from_lhdf(
                filename, ChunkSize=chunk_size,
                AtomIndices=atom_indices_to_load)
            for tchunk in chunkiter:
                if tchunk['XYZList'].shape[1] != generators['XYZList'].shape[1]:
                    msg = ("Number of atoms in generators does not match "
                           "traj we're trying to assign! Maybe check atom indices?")
                    raise ValueError(msg)

                ptchunk = metric.prepare_trajectory(tchunk)
                this_length = len(ptchunk)

                distances = np.empty(this_length, dtype=np.float32)
                # np.int was only an alias of the builtin int and has been
                # removed from NumPy; the builtin gives the same dtype
                assignments = np.empty(this_length, dtype=int)

                for j in xrange(this_length):
                    d = metric.one_to_all(ptchunk, pgens, j)
                    ind = np.argmin(d)
                    assignments[j] = ind
                    distances[j] = d[ind]

                end_index = start_index + this_length
                fh_a.root.arr_0[i, start_index:end_index] = assignments
                fh_d.root.arr_0[i, start_index:end_index] = distances

                # i'm not sure exactly what the optimal flush frequency is
                fh_a.flush()
                fh_d.flush()
                start_index = end_index

            # we're going to keep duplicates of this record -- i.e. writing
            # it to both files

            # completed chunks are not checkpointed -- only completed
            # trajectories. this means that if the process dies after
            # completing 10/20 of the chunks in trajectory i -- those chunks
            # are going to have to be recomputed (but trajectory i-1 is saved)

            # this could be changed, but the implementation is a little
            # tricky -- you have to watch out for the fact that the person
            # might call this function with chunk_size=N, let it run for a
            # while, kill it, and then call it again with chunk_size != N.
            # Dealing with that appropriately is tricky since the chunks
            # wont line up in the two cases
            fh_a.root.completed_trajs[i] = True
            fh_d.root.completed_trajs[i] = True
    finally:
        try:
            fh_a.close()
        finally:
            fh_d.close()