def test_HDF5DatasetLoader_1():
    from mdtraj import io
    assert HDF5DatasetLoader.short_name == 'hdf5'

    cwd = os.path.abspath(os.curdir)
    dirname = tempfile.mkdtemp()
    try:
        os.chdir(dirname)

        # one file
        io.saveh('f1.h5', **{'test': np.zeros((10, 3))})
        loader = HDF5DatasetLoader('f1.h5', concat=False)
        X, y = loader.load()
        assert np.all(X == np.zeros((10, 3)))
        assert y is None

        # two files
        io.saveh('f2.h5', **{'test': np.ones((10, 3))})
        loader = HDF5DatasetLoader('f*.h5', concat=False)
        X, y = loader.load()
        assert isinstance(X, list)
        assert np.all(X[0] == np.zeros((10, 3)))
        assert np.all(X[1] == np.ones((10, 3)))
        assert y is None

        # concat and stride and y_col
        loader = HDF5DatasetLoader('f*.h5', y_col=2, stride=2, concat=True)
        X, y = loader.load()
        assert X.shape[0] == 10 and X.shape[1] == 2
        assert y.shape[0] == 10
    finally:
        os.chdir(cwd)
        shutil.rmtree(dirname)
def save(self, filename):
    """Save the results and everything needed to use this object again.

    Parameters
    ----------
    filename : str
        Filename to save the data to. Will use mdtraj.io.saveh.

    Returns
    -------
    filename : str
        The same filename, in case you want it back.
    """
    kwargs = {}
    kwargs['regularizer'] = np.array([pickle.dumps(self.regularizer)])
    kwargs['eta'] = np.array([self.eta])

    print 'has_solution?', self._has_solution
    if self._has_solution:
        kwargs['sol'] = self.v

    io.saveh(filename, **kwargs)
    return filename
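# Hedged sketch (not from the source): the load counterpart to the save() above.
# It assumes only what that save() writes -- a pickled 'regularizer', an 'eta'
# scalar, and an optional 'sol' array -- and uses mdtraj.io.loadh to read them
# back. The function name and return layout are illustrative.
def load_solution(filename):
    import pickle
    from mdtraj import io

    data = io.loadh(filename, deferred=False)            # plain dict of saved arrays
    regularizer = pickle.loads(data['regularizer'][0])   # un-pickle the stored object
    eta = float(data['eta'][0])
    sol = data['sol'] if 'sol' in data else None          # present only if a solution was saved
    return regularizer, eta, sol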
def files_to_shotset(cls, list_of_cbf_files, shotset_filename=None,
                     autocenter=True):
    """
    Convert a bunch of CBF files to a single ODIN shotset instance.
    If you write the shotset immediately to disk, does this in a smart
    "lazy" way so as to preserve memory.

    Parameters
    ----------
    list_of_cbf_files : list of str
        A list of paths to CBF files to convert.

    Optional Parameters
    -------------------
    shotset_filename : str
        The filename of the shotset to write to disk.
    autocenter : bool
        Whether or not to automatically determine the center of the detector.

    Returns
    -------
    ss : odin.xray.Shotset
        If `shotset_filename` is None, then returns the shotset object
    """
    # convert one CBF, and use it to get the detector, etc info
    seed_shot = cls(list_of_cbf_files[0], autocenter=autocenter).as_shotset()

    if shotset_filename:
        logger.info('writing CBF files straight to disk at: %s' % shotset_filename)
        seed_shot.save(shotset_filename)

        # now open a handle to that h5 file and add to it
        for i, fn in enumerate(list_of_cbf_files[1:]):
            # i+1 b/c we already saved one shot
            d = {('shot%d' % (i + 1,)): cls(fn, autocenter=False).intensities.flatten()}
            io.saveh(shotset_filename, **d)

        io.saveh(shotset_filename, num_shots=np.array([len(list_of_cbf_files)]))
        logger.info('Combined CBF data into: %s' % shotset_filename)
        return

    else:
        shot_i = np.zeros((len(list_of_cbf_files), seed_shot.intensities.shape[1]))
        shot_i[0, :] = seed_shot.intensities.flatten()

        for i, fn in enumerate(list_of_cbf_files[1:]):
            x = cls(fn, autocenter=False).intensities.flatten()
            if not len(x) == shot_i.shape[1]:
                raise ValueError('Variable number of pixels in shots!')
            shot_i[i + 1, :] = x

        ss = xray.Shotset(shot_i, seed_shot.detector, seed_shot.mask)
        return ss
def test_RaggedArray_load_specific_h5_arrays(self):
    src = np.array(range(55))
    a = ra.RaggedArray(array=src, lengths=[25, 30])

    with tempfile.NamedTemporaryFile(suffix='.h5') as f:
        io.saveh(f.name, key0=a[0], key1=a[1])
        b = ra.load(f.name, keys=['key1'])

    assert_array_equal(a[1], b[0])
def project(n_parms, tica_evs, tica_lag):
    ref1 = np.loadtxt('selected_frames/%s.txt' % parms[0])
    d = np.zeros((len(ref1), n_parms))
    for p in range(n_parms):
        data = np.loadtxt('selected_frames/%s.txt' % (parms[p]))
        d[:, p] = data
    proj = np.dot(d, tica_evs.T)
    io.saveh('selected_frames/selected_frames_on_tica_l%d.h5' % (tica_lag), proj)
    return proj
def save(self, output_fn):
    """Save results to a .h5 file."""
    kernel_str = pickle.dumps(self.kernel)
    io.saveh(output_fn, vals=self.vals, betas=self.betas, K=self.K,
             Ku=self.Ku, eta=np.array([self.eta]), Xtrain=self._Xtrain,
             dt=np.array([self.dt]), kernel_str=np.array([kernel_str]))
def save_maps(rs_map, savepath):
    """
    Save maps as separate keys in .h5 file format so that downstream
    loading isn't problematic.
    """
    for i, item in enumerate(rs_map):
        name = 'n%s' % i
        data = {name: item}
        io.saveh(savepath + ".h5", **data)
    return
def entry_point():
    args = parser.parse_args()
    arglib.die_if_path_exists(args.output)

    if args.atom_indices.lower() == 'all':
        atom_indices = None
    else:
        atom_indices = np.loadtxt(args.atom_indices).astype(int)

    project = Project.load_from(args.project)
    SASA = run(project, atom_indices, args.traj_fn)

    io.saveh(args.output, SASA)
def save(self, output_fn):
    """Save results to a .h5 file."""
    kernel_str = pickle.dumps(self.kernel)
    io.saveh(output_fn, vals=self.vals, vecs=self.vecs, K=self.K,
             K_uncentered=self.K_uncentered,
             reg_factor=np.array([self.reg_factor]), M=self.M,
             a=self.a, a_mean=self.a_mean, a_stdev=self.a_stdev,
             kernel_str=np.array([kernel_str]))
def test_overwrite_2():
    fid, fn = tempfile.mkstemp()
    try:
        a = np.arange(10)
        b = a + 1
        io.saveh(fn, a=a)
        io.saveh(fn, a=b)
        eq(io.loadh(fn, 'a'), b)
    except:
        raise
    finally:
        if os.path.exists(fn):
            os.close(fid)
            os.unlink(fn)
def save(self, output_fn):
    """Save results to a .h5 file."""
    kernel_str = pickle.dumps(self.kernel)
    io.saveh(output_fn, vals=self.vals, vecs=self.vecs, K=self.K,
             K_uncentered=self.K_uncentered,
             reg_factor=np.array([self.reg_factor]), traj=self._Xall,
             dt=np.array([self.dt]),
             normalized=np.array([self._normalized]),
             kernel_str=np.array([kernel_str]))
def save(self, output):
    """Save the results to file.

    Parameters
    ----------
    output : str
        Output filename (.h5)
    """
    # Serialize metric used to calculate tICA input.
    metric_string = cPickle.dumps(self.prep_metric)

    io.saveh(output, timelag_corr_mat=self.timelag_corr_mat,
             cov_mat=self.cov_mat, lag=np.array([self.lag]),
             vals=self.vals, vecs=self.vecs,
             metric_string=np.array([metric_string]))
def project(start_traj, end_traj, n_parms, tica, tica_lag):
    dataset = []
    if not os.path.exists('analysis/tica_projections'):
        os.system('mkdir analysis/tica_projections')
    for i in range(start_traj, end_traj + 1):
        ref1 = np.loadtxt('analysis/%d/analysis/parameters/%s.txt' % (i, parms[0]))
        d = np.zeros((len(ref1), n_parms))
        for p in range(n_parms):
            data = np.loadtxt('analysis/%d/analysis/parameters/%s.txt' % (i, parms[p]))
            d[:, p] = data
        proj = np.dot(d, tica['components'].T)
        io.saveh('analysis/tica_projections/traj%d_on_tica_l%d.h5' % (i, tica_lag), proj)
        print "\tsaved projected trajectory %d at folder 'analysis/tica_projections' " % i
        dataset.append(proj)
    return dataset
def entry_point():
    args = parser.parse_args()
    k = int(args.num_states) if args.num_states != 'none' else None
    d = float(args.cutoff_distance) if args.cutoff_distance != 'none' else None

    arglib.die_if_path_exists(args.assignments)
    if k is None and d is None:
        logger.error(
            'You need to supply either a number of states or a cutoff distance')
        sys.exit(1)

    project = Project.load_from(args.project)
    assignments = main(k, d, args.hierarchical_clustering_zmatrix,
                       args.stride, project)
    io.saveh(args.assignments, assignments)
    logger.info('Saved assignments to %s', args.assignments)
def entry_point():
    args, metric = parser.parse_args()
    arglib.die_if_path_exists(args.output)
    project = Project.load_from(args.project)
    pdb = md.load(args.pdb)

    if args.traj_fn.lower() == 'all':
        traj_fn = None
    else:
        traj_fn = args.traj_fn

    distances = run(project, pdb, metric, traj_fn)
    io.saveh(args.output, distances)
    logger.info('Saved to %s', args.output)
def entry_point():
    args = parser.parse_args()
    k = int(args.num_states) if args.num_states != 'none' else None
    d = float(args.cutoff_distance) if args.cutoff_distance != 'none' else None

    arglib.die_if_path_exists(args.assignments)
    if k is None and d is None:
        logger.error(
            'You need to supply either a number of states or a cutoff distance')
        sys.exit(1)

    project = Project.load_from(args.project)
    assignments = main(k, d, args.hierarchical_clustering_zmatrix,
                       args.stride, project)
    io.saveh(args.assignments, assignments)
    logger.info('Saved assignments to %s', args.assignments)
def save_to_disk(self, filename):
    """Save this clusterer to disk.

    This is useful because computing the Z-matrix (done in __init__) is the
    most expensive part, and assigning is cheap.

    Parameters
    ----------
    filename : str
        Location to save to.

    Raises
    ------
    Exception if something already exists at `filename`
    """
    io.saveh(filename, z_matrix=self.Z, traj_lengths=self.traj_lengths)
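# Hedged sketch (not from the source): reading back what save_to_disk() wrote.
# The clusterer itself exposes Hierarchical.load_from_disk (used elsewhere in
# this collection); this standalone helper only illustrates that the file holds
# the two keys saved above, 'z_matrix' and 'traj_lengths'.
def read_zmatrix(filename):
    from mdtraj import io
    data = io.loadh(filename, deferred=False)
    return data['z_matrix'], data['traj_lengths']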
def entry_point():
    args = parser.parse_args()
    arglib.die_if_path_exists(args.output)

    try:
        assignments = io.loadh(args.assignments, 'arr_0')
        distances = io.loadh(args.distances, 'arr_0')
    except KeyError:
        assignments = io.loadh(args.assignments, 'Data')
        distances = io.loadh(args.distances, 'Data')

    trimmed = run(assignments, distances, args.rmsd_cutoff)
    io.saveh(args.output, trimmed)
    logger.info('Saved output to %s', args.output)
def save(self, output):
    """Save the results to file.

    Parameters
    ----------
    output : str
        Output filename (.h5)
    """
    # Serialize metric used to calculate tICA input.
    metric_string = cPickle.dumps(self.prep_metric)

    io.saveh(output, timelag_corr_mat=self.timelag_corr_mat,
             cov_mat=self.cov_mat, lag=np.array([self.lag]),
             vals=self.vals, vecs=self.vecs,
             metric_string=np.array([metric_string]))
def _save_masks(self, rs_mask):
    """
    Save mask in h5 format, with each key corresponding to a separate image.
    Currently stored in a temporary directory since arrays for the same image
    from different batches must still be compiled.
    """
    output_dir = self.system['map_path'] + "temp/"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for i, item in enumerate(rs_mask):
        name = 'arr_%s' % i
        data = {name: item}
        io.saveh(output_dir + "masks_b%s.h5" % self.nbatch, **data)
    return
def setup(self):
    self.q_values = np.array([1.0, 2.0])
    self.num_phi = 360
    self.l = 50.0
    self.d = xray.Detector.generic(spacing=0.4, l=self.l)
    self.t = trajectory.load(ref_file('ala2.pdb'))

    self.num_shots = 2
    intensities = np.abs(np.random.randn(self.num_shots, self.d.num_pixels))
    io.saveh('tmp_tables.h5', data=intensities)

    self.tables_file = tables.File('tmp_tables.h5')
    self.i = self.tables_file.root.data
    self.shot = xray.Shotset(self.i, self.d)
    return
def setup(self):
    self.q_values = np.array([1.0, 2.0])
    self.num_phi = 360
    self.l = 50.0
    self.d = xray.Detector.generic(spacing=0.4, l=self.l)
    self.t = Trajectory.load(ref_file('ala2.pdb'))

    self.num_shots = 2
    intensities = np.abs(np.random.randn(self.num_shots, self.d.num_pixels))
    io.saveh('tmp_tables.h5', data=intensities)

    self.tables_file = tables.File('tmp_tables.h5')
    self.i = self.tables_file.root.data
    self.shot = xray.Shotset(self.i, self.d)
    return
def start(self):
    import pickle
    from mdtraj import io
    from glob import glob
    import numpy as np

    featurizer = np.load(self.args.featurizer)
    topology = featurizer.reference_traj

    filenames = [fn for t in self.args.trajectories for fn in glob(t)]
    X, indices, fns = featurize_all(filenames, featurizer, topology)
    y = self.model.fit_transform(X)

    io.saveh(self.args.out, X=y, indices=indices, fns=fns,
             labels=np.array(self.labels),
             featurizer=np.array([pickle.dumps(featurizer)]))
    print('Projection saved: %s' % self.args.out)
def save(self, output_name, txt=False, txt_fmt='%d %d %d %f'):
    if txt:
        x1_coords_flat = self.x1_coords.flatten()
        x2_coords_flat = self.x2_coords.flatten()
        values_flat = self.values.flatten()
        states = np.arange(len(x1_coords_flat))

        output_data = np.array(
            list(zip(states, x1_coords_flat, x2_coords_flat, values_flat)))
        np.savetxt(output_name, output_data, fmt=txt_fmt,
                   header='state x1 x2 energy')
    else:
        output_dict = {
            'x1_coords': self.x1_coords,
            'x2_coords': self.x2_coords,
            'landscape': self.values
        }
        io.saveh(output_name, **output_dict)
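# Hedged sketch (not from the source): loading the HDF5 branch written by the
# save() above. The keys ('x1_coords', 'x2_coords', 'landscape') come directly
# from that method; the function name is illustrative.
def load_landscape(filename):
    from mdtraj import io
    data = io.loadh(filename, deferred=False)
    return data['x1_coords'], data['x2_coords'], data['landscape']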
def test_RaggedArray_load_h5_arrays(self):
    src = np.array(range(55))
    a = ra.RaggedArray(array=src, lengths=[25, 30])
    with tempfile.NamedTemporaryFile(suffix='.h5') as f:
        io.saveh(f.name, key0=a[0], key1=a[1])
        b = ra.load(f.name, keys=['key0', 'key1'])
    assert_ra_equal(a, b)

    src = np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                    [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]]).T
    a = ra.RaggedArray(array=src, lengths=[4, 6])
    with tempfile.NamedTemporaryFile(suffix='.h5') as f:
        io.saveh(f.name, key0=a[0], key1=a[1])
        b = ra.load(f.name, keys=['key0', 'key1'])
    assert_ra_equal(a, b)
def start(self):
    import pickle
    import mdtraj as md
    from mdtraj import io
    from glob import glob
    import numpy as np

    featurizer = np.load(self.args.featurizer)
    topology = md.load(self.args.top)

    filenames = [fn for t in self.args.trajectories for fn in glob(t)]
    X, indices, fns = featurize_all(filenames, featurizer, topology)
    y = self.model.fit_transform([X])
    fns = np.array([fn.encode('utf-8') for fn in fns])

    io.saveh(self.args.out, X=y[0], indices=indices, fns=fns,
             labels=np.array(self.labels),
             topology=np.array([pickle.dumps(topology)]),
             featurizer=np.array([pickle.dumps(featurizer)]))
    print('Projection saved: %s' % self.args.out)
def save(output_name, ragged_array):
    """Save a RaggedArray or numpy ndarray to disk as an HDF5 file.

    Parameters
    ----------
    output_name : str
        Path of file to write out.
    ragged_array : np.ndarray, RaggedArray
        Array to write to disk.

    See Also
    --------
    mdtraj.io.saveh
    """
    try:
        io.saveh(output_name, array=ragged_array._data,
                 lengths=ragged_array.lengths)
    except AttributeError:
        # An AttributeError results when the input is actually an ndarray
        # (which has no ._data or .lengths), so save it directly.
        io.saveh(output_name, ragged_array)
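# Hedged sketch (not from the source): the inverse of the save() above,
# rebuilding a RaggedArray from the 'array' and 'lengths' keys it writes. It
# assumes the RaggedArray(array=..., lengths=...) constructor used in the tests
# earlier in this collection; a plain ndarray saved by the fallback branch comes
# back under the default 'arr_0' key instead.
def load_ragged(filename):
    from mdtraj import io
    data = io.loadh(filename, deferred=False)
    if 'lengths' in data:
        return ra.RaggedArray(array=data['array'], lengths=data['lengths'])
    return data['arr_0']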
def _save_old_style(output_name, ragged_array):
    """Deprecated en bloc RaggedArray saving routine.

    Parameters
    ----------
    output_name : str
        Path of file to write out.
    ragged_array : np.ndarray, RaggedArray
        Array to write to disk.

    See Also
    --------
    mdtraj.io.saveh
    """
    try:
        io.saveh(output_name, array=ragged_array._data,
                 lengths=ragged_array.lengths)
    except AttributeError:
        # An AttributeError results when the input is actually an ndarray
        # (which has no ._data or .lengths), so save it directly.
        io.saveh(output_name, ragged_array)
def compile_masks(system):
    """
    Compile per-batch Bragg masks into a composite file.
    """
    #n_images = system['batch_size'] * system['n_batch']
    n_images = len(system['img2batch'])
    dtc_size = system['shape'][0] * system['shape'][1]
    comb_mask = np.zeros((n_images, dtc_size), dtype=np.uint8)

    # combine all temp files
    for batch in range(int(system['n_batch'])):
        print "on batch %i" % batch
        for img in range(int(n_images)):
            comb_mask[img] += io.loadh(system['map_path'] + "temp/masks_b%s.h5" % batch,
                                       "arr_%i" % img)

    print "saving combined mask"
    for i, item in enumerate(comb_mask):
        name = 'arr_%s' % i
        data = {name: item}
        io.saveh(system['map_path'] + "combined_braggmasks.h5", **data)
    return
def start(self):
    import pickle
    import mdtraj as md
    from mdtraj import io
    from glob import glob
    import numpy as np

    featurizer = np.load(self.args.featurizer)
    topology = md.load(self.args.top)

    filenames = [fn for t in self.args.trajectories for fn in glob(t)]
    X, indices, fns = featurize_all(filenames, featurizer, topology)
    y = self.model.fit_transform([X])
    fns = np.array([fn.encode('utf-8') for fn in fns])

    io.saveh(self.args.out, X=y[0], indices=indices, fns=fns,
             labels=np.array(self.labels),
             topology=np.array([pickle.dumps(topology)]),
             featurizer=np.array([pickle.dumps(featurizer)]))
    print('Projection saved: %s' % self.args.out)
def save(outdir, traj_lengths, stride, n_real_atoms, centers, assignments,
         distances, scores, times):
    assignments = reshape_for_output(assignments, np.int, traj_lengths, stride)
    distances = reshape_for_output(distances, np.float, traj_lengths, stride)

    centers = centers.swapaxes(1, 2)
    centers = centers[:, 0:n_real_atoms, :]

    os.makedirs(outdir)
    log('saving results to %s/' % outdir)
    io.saveh(os.path.join(outdir, 'centers.h5'), XYZList=centers)
    io.saveh(os.path.join(outdir, 'Assignments.h5'), assignments)
    io.saveh(os.path.join(outdir, 'Assignments.h5.distances'), distances)
    if len(scores) > 0 and len(times) > 0:
        io.saveh(os.path.join(outdir, 'convergence.h5'), scores=scores, times=times)
def save_container(filename, dtype):
    io.saveh(filename,
             arr_0=-1 * np.ones((project.n_trajs, np.max(project.traj_lengths)),
                                dtype=dtype),
             completed_trajs=np.zeros((project.n_trajs), dtype=np.bool))
##############################################################################
# Code
##############################################################################


def main(k, d, zmatrix_fn, stride, project):
    hierarchical = Hierarchical.load_from_disk(zmatrix_fn)
    assignments = hierarchical.get_assignments(k=k, cutoff_distance=d)

    new_assignments = np.ones((project.n_trajs, project.traj_lengths.max()),
                              dtype=np.int) * -1
    new_assignments[:, ::stride] = assignments
    return new_assignments


if __name__ == "__main__":
    args = parser.parse_args()
    k = int(args.num_states) if args.num_states != 'none' else None
    d = float(args.cutoff_distance) if args.cutoff_distance != 'none' else None

    arglib.die_if_path_exists(args.assignments)
    if k is None and d is None:
        logger.error(
            'You need to supply either a number of states or a cutoff distance')
        sys.exit(1)

    project = Project.load_from(args.project)
    assignments = main(k, d, args.hierarchical_clustering_zmatrix,
                       args.stride, project)
    io.saveh(args.assignments, assignments)
    logger.info('Saved assignments to %s', args.assignments)
def main(args, metric):
    if args.alg == 'sclarans' and args.stride != 1:
        logger.error("""You don't want to use a stride with sclarans. The
whole point of sclarans is to use a shrink multiple to accomplish the same
purpose, but in parallel with stochastic subsampling. If you can't fit all
your frames into memory at the same time, maybe you could stride a little at
the beginning, but it's not recommended.""")
        sys.exit(1)

    # if we have a metric that explicitly operates on a subset of indices,
    # then we provide the option to only load those indices into memory
    # WARNING: I also do something a bit dirty, and inject `None` for the
    # RMSD.atomindices to get the metric to not splice
    if isinstance(metric, metrics.RMSD):
        atom_indices = metric.atomindices
        metric.atomindices = None  # probably bad...
        logger.info('RMSD metric - loading only the atom indices required')
    else:
        atom_indices = None

    # In case the clustering / algorithm needs extra arguments, use
    # this dictionary
    extra_kwargs = {}

    # Check to be sure we won't overwrite any data
    if args.alg == 'hierarchical':
        zmatrix_fn = os.path.join(args.output_dir, 'ZMatrix.h5')
        die_if_path_exists(zmatrix_fn)
        extra_kwargs['zmatrix_fn'] = zmatrix_fn
    else:
        generators_fn = os.path.join(args.output_dir, 'Gens.h5')
        die_if_path_exists(generators_fn)
        if args.stride == 1:
            assignments_fn = os.path.join(args.output_dir, 'Assignments.h5')
            distances_fn = os.path.join(args.output_dir, 'Assignments.h5.distances')
            die_if_path_exists([assignments_fn, distances_fn])

    project = Project.load_from(args.project)

    if isinstance(metric, metrics.Vectorized) and not args.alg == 'hierarchical':
        # if the metric is vectorized then we can load prepared trajectories,
        # which may allow for better memory efficiency
        ptrajs, which = load_prep_trajectories(project, args.stride,
                                               atom_indices, metric)
        trajectories = None
        n_trajs = len(ptrajs)
        num_frames = np.sum([len(p) for p in ptrajs])
        if num_frames != len(which):
            raise Exception("something went wrong in loading step (%d v %d)"
                            % (num_frames, len(which)))
    else:
        trajectories = load_trajectories(project, args.stride, atom_indices)
        ptrajs = None
        which = None
        n_trajs = len(trajectories)

    logger.info('Loaded %d trajs', n_trajs)

    clusterer = cluster(metric, trajectories, ptrajs, args, **extra_kwargs)

    if not isinstance(clusterer, clustering.Hierarchical):
        if isinstance(metric, metrics.Vectorized):
            gen_inds = clusterer.get_generator_indices()
            generators = project.load_frame(which[gen_inds, 0], which[gen_inds, 1])
        else:
            generators = clusterer.get_generators_as_traj()

        logger.info('Saving %s', generators_fn)
        generators.save(generators_fn)

        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()

            logger.info('Since stride=1, Saving %s', assignments_fn)
            logger.info('Since stride=1, Saving %s', distances_fn)
            io.saveh(assignments_fn, assignments)
            io.saveh(distances_fn, distances)
def compute_crysol(trajectory, save_to=None):
    """
    Compute crysol for all the snapshots in an msmbuilder trajectory.

    Parameters
    ----------
    trajectory : msmbuilder.Trajectory.trajectory
        The trajectory to compute SAXS for

    save_to : str or None
        If this is a string, will save to an h5 file of that name.

    Returns
    -------
    q_values : np.ndarray
        The q_values at which the scattering was computed, in ()

    scattering_pred : np.ndarray
        The estimated integrated intensity for each `q_value`
    """
    setup_tmp_dir()

    if type(trajectory) == str:
        trajectory = Trajectory.load_trajectory_file(trajectory)

    os.chdir(TEMPDIR)
    scattering_pred = None

    for i in range(len(trajectory)):
        frame = trajectory[i]
        pdbfn = '%s/tmp4crysol.pdb' % TEMPDIR
        frame.save_to_pdb(pdbfn)

        # run crysol command line
        args = ['/%s %s' % kv for kv in crysol_params.items()]
        cmd = ['crysol', pdbfn] + args
        print cmd
        subprocess.check_call(' '.join(cmd), shell=True,
                              stdout=DEVNULL, stderr=DEVNULL)

        # parse the output
        intensities_output = 'tmp4crysol00.int'
        if not os.path.exists(intensities_output):
            raise IOError('crysol output not found')
        d = np.genfromtxt(intensities_output, skip_header=1)
        q_values = d[:, 0]

        # initialize output space
        if scattering_pred is None:
            scattering_pred = np.zeros((len(trajectory), d.shape[0]))

        scattering_pred[i, :] = d[:, 3]

        os.remove(pdbfn)
        os.remove(intensities_output)
        os.remove('tmp4crysol00.alm')
        os.remove('tmp4crysol00.log')

    if save_to:
        io.saveh(save_to, q_values=q_values, saxs=scattering_pred)
        print "Saved: %s" % save_to
        return
    else:
        return q_values, scattering_pred
def ConvertDihedralsToArray(phi, psi):
    HCarray = np.zeros((phi.shape))
    for i in range(len(phi)):
        for j in range(len(phi[i])):
            if is_helical_peptide(phi[i, j], psi[i, j]):
                HCarray[i][j] = 1
            else:
                HCarray[i][j] = 0
    return HCarray


def count_n_helices(HCarray):
    return HCarray.sum(1)


project = Project.load_from(args.Project)
assignments = -1 * np.ones((project.n_trajs, max(project.traj_lengths)))

for trajid in range(project.n_trajs):
    print "Working on: %s" % project.traj_filename(trajid)
    traj = project.load_traj(trajid)
    phi = mdtraj.compute_phi(traj)[1] * 360 / (2 * np.pi)
    psi = mdtraj.compute_psi(traj)[1] * 360 / (2 * np.pi)
    HCarray = ConvertDihedralsToArray(phi, psi)
    assignments[trajid][:traj.n_frames] = count_n_helices(HCarray)

assignments = assignments.astype(int)
io.saveh(args.Output, assignments)
    for i in range(k):
        transmat[i, :] = countsmat[i, :] / np.sum(countsmat[i, :])

    eigs = np.sort(np.real(np.linalg.eigvals(transmat)))
    timescales = -lag_time / np.log(eigs)
    return timescales


trajectories = []
for i in range(10):
    fn = 'trajectory-%d.h5' % i
    if os.path.exists(fn):
        print 'loading %s' % fn
        trajectories.append(io.loadh(fn)['arr_0'])
    else:
        x = propagate(5e5)
        io.saveh(fn, x)
        print 'saving %s' % fn
        trajectories.append(x)


def msm_timescale(trajectories, lag_times, n_states=2, discretization='grid'):
    all_timescales = np.zeros((len(trajectories), len(lag_times)))
    for i, x in enumerate(trajectories):
        all_timescales[i] = [msm_solution(x, n_states, lag_time,
                                          discretization=discretization)[-2]
                             for lag_time in lag_times]
    return (np.mean(all_timescales, axis=0),
            np.std(all_timescales, axis=0) / np.sqrt(len(trajectories)))


def ghmm_timescale(trajectories, lag_times, n_states=2):
    all_timescales = np.zeros((len(trajectories), len(lag_times)))
    for i, x in enumerate(trajectories):
        all_timescales[i] = [GaussianFusionHMM(n_states, n_features=1, fusion_prior=0)
                             .fit([x[::l].reshape(-1, 1)]).timescales_()[-1] * l
                             for l in lag_times]
    return (np.mean(all_timescales, axis=0),
            np.std(all_timescales, axis=0) / np.sqrt(len(trajectories)))
parser.add_argument('rmsd_cutoff', help="""distance value at which to trim,
    in. Data further than this value to its generator will be discarded.
    Note: this is measured with whatever distance metric you used to
    cluster""", type=float)
parser.add_argument('output', default='Data/Assignments.Trimmed.h5')


def run(assignments, distances, cutoff):
    number = np.count_nonzero(distances > cutoff)
    logger.info('Discarding %d assignments', number)

    assignments[np.where(distances > cutoff)] = -1
    return assignments


if __name__ == "__main__":
    args = parser.parse_args()
    arglib.die_if_path_exists(args.output)

    try:
        assignments = io.loadh(args.assignments, 'arr_0')
        distances = io.loadh(args.distances, 'arr_0')
    except KeyError:
        assignments = io.loadh(args.assignments, 'Data')
        distances = io.loadh(args.distances, 'Data')

    trimmed = run(assignments, distances, args.rmsd_cutoff)
    io.saveh(args.output, trimmed)
    logger.info('Saved output to %s', args.output)
            traj_asa = []
            logger.info("Working on Trajectory %d", traj_ind)
            traj_fn = project.traj_filename(traj_ind)
            chunk_ind = 0
            for traj_chunk in md.iterload(traj_fn, atom_indices=atom_indices, chunk=1000):
                traj_asa.extend(md.shrake_rupley(traj_chunk))
                chunk_ind += 1
            SASA[traj_ind, 0:project.traj_lengths[traj_ind]] = traj_asa
    else:
        traj_asa = []
        for traj_chunk in Trajectory.enum_chunks_from_lhdf(traj_fn, AtomIndices=atom_indices):
            traj_asa.extend(asa.calculate_asa(traj_chunk))
        SASA = np.array(traj_asa)

    return SASA


if __name__ == '__main__':
    args = parser.parse_args()
    arglib.die_if_path_exists(args.output)

    if args.atom_indices.lower() == 'all':
        atom_indices = None
    else:
        atom_indices = np.loadtxt(args.atom_indices).astype(int)

    project = Project.load_from(args.project)
    SASA = run(project, atom_indices, args.traj_fn)

    io.saveh(args.output, SASA)
pairs = []
for i in range(len(listt)):
    for j in range(i + 1, len(listt)):
        pairs.append([listt[i], listt[j]])
pairs = np.array(pairs)
print "len(pairs):", len(pairs)

# stage 1
for i in range(4):
    traj = md.load('../md_files/stage1_xtc/protein_%d.xtc' % i, top=ref)
    print "stage1: traj, xyz.shape:", i, traj.xyz.shape
    d = md.compute_contacts(traj, contacts=pairs, scheme='closest-heavy',
                            ignore_nonprotein=True)
    io.saveh('s1_%d.h5' % i, distances=d[0])
    io.saveh('s1_%d.h5' % i, residue_pairs=d[1])

# stage 2
for i in range(20):
    traj = md.load('../md_files/stage2_xtc/protein_%d.xtc' % i, top=ref)
    print "stage2: traj, xyz.shape:", i, traj.xyz.shape
    d = md.compute_contacts(traj, contacts=pairs, scheme='closest-heavy',
                            ignore_nonprotein=True)
    io.saveh('s2_%d.h5' % i, distances=d[0])
    io.saveh('s2_%d.h5' % i, residue_pairs=d[1])

# stage 3
for i in range(20):
        cc_aniso[counter] = model_utils.mweighted_cc(pred_aniso.copy(),
                                                     exp_map.copy(), mult=mult)
        print "gamma: %.2f, sigma: %.2f, CC: %.4f" % (gamma, sigma, cc_aniso[counter])
        counter += 1

    cc_aniso = cc_aniso.reshape(len(gamma_range), len(sigma_range))
    return cc_aniso


if __name__ == '__main__':

    start = time.time()
    #sigma_range, gamma_range = np.arange(0.05, 1.55, 0.05), np.arange(3.0, 93.0, 3.0)
    sigma_range, gamma_range = np.arange(0.5, 0.61, 0.01), np.arange(12.0, 21.0)

    # load system and generate symmetry information
    system = pickle.load(open(sys.argv[1], "rb"))
    symm_ops = pickle.load(open("reference/symm_ops.pickle", "rb"))[sys.argv[2]]
    symm_idx, grid, mult = model_utils.generate_symmates(symm_ops, system, laue=False)

    # load molecular transform and experimental maps
    transform = np.load(sys.argv[3])
    experimental = np.load(sys.argv[4])

    # scan across sigma and gamma ranges; save mesh and cc_aniso to same .h5 file
    cc_aniso = scan(system, transform, experimental, sigma_range, gamma_range,
                    sys.argv[5], mult)
    io.saveh(sys.argv[6] + "/%s.h5" % (sys.argv[5]), sigmas=sigma_range)
    io.saveh(sys.argv[6] + "/%s.h5" % (sys.argv[5]), gammas=gamma_range)
    io.saveh(sys.argv[6] + "/%s.h5" % (sys.argv[5]), cc=cc_aniso)

    print "elapsed time is %.3f" % ((time.time() - start) / 60.0)
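# Hedged sketch (not from the source): the three io.saveh calls above accumulate
# keys in the same .h5 file (saveh adds to or overwrites keys in an existing
# file, as the overwrite test earlier in this collection relies on), so the scan
# results can be read back together. The path argument and function name are
# illustrative.
def load_scan(h5_path):
    from mdtraj import io
    data = io.loadh(h5_path, deferred=False)
    return data['sigmas'], data['gammas'], data['cc']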
import os, sys
from msmbuilder import Project
import mdtraj as md
from mdtraj import io
import numpy as np

project = Project.load_from("ProjectInfo-RRR.yaml")
Rgs = -1 * np.ones((project.n_trajs, max(project.traj_lengths)))

for i in range(project.n_trajs):
    t = project.load_traj(i)
    rg = md.compute_rg(t)
    Rgs[i][:len(rg)] = rg

io.saveh('Rgs-RRR.h5', Rgs)
def files_to_shotset(cls, list_of_cbf_files, shotset_filename=None,
                     autocenter=True):
    """
    Convert a bunch of CBF files to a single ODIN shotset instance.
    If you write the shotset immediately to disk, does this in a smart
    "lazy" way so as to preserve memory.

    Parameters
    ----------
    list_of_cbf_files : list of str
        A list of paths to CBF files to convert.

    Optional Parameters
    -------------------
    shotset_filename : str
        The filename of the shotset to write to disk.
    autocenter : bool
        Whether or not to automatically determine the center of the detector.

    Returns
    -------
    ss : odin.xray.Shotset
        If `shotset_filename` is None, then returns the shotset object
    """
    # convert one CBF, and use it to get the detector, etc info
    seed_shot = cls(list_of_cbf_files[0], autocenter=autocenter).as_shotset()

    if shotset_filename:
        logger.info('writing CBF files straight to disk at: %s' % shotset_filename)
        seed_shot.save(shotset_filename)

        # now open a handle to that h5 file and add to it
        for i, fn in enumerate(list_of_cbf_files[1:]):
            # i+1 b/c we already saved one shot
            d = {
                ('shot%d' % (i + 1,)): cls(fn, autocenter=False).intensities.flatten()
            }
            io.saveh(shotset_filename, **d)

        io.saveh(shotset_filename, num_shots=np.array([len(list_of_cbf_files)]))
        logger.info('Combined CBF data into: %s' % shotset_filename)
        return

    else:
        shot_i = np.zeros(
            (len(list_of_cbf_files), seed_shot.intensities.shape[1]))
        shot_i[0, :] = seed_shot.intensities.flatten()

        for i, fn in enumerate(list_of_cbf_files[1:]):
            x = cls(fn, autocenter=False).intensities.flatten()
            if not len(x) == shot_i.shape[1]:
                raise ValueError('Variable number of pixels in shots!')
            shot_i[i + 1, :] = x

        ss = xray.Shotset(shot_i, seed_shot.detector, seed_shot.mask)
        return ss
def main(args, metric):
    if args.alg == 'sclarans' and args.stride != 1:
        logger.error("""You don't want to use a stride with sclarans. The
whole point of sclarans is to use a shrink multiple to accomplish the same
purpose, but in parallel with stochastic subsampling. If you can't fit all
your frames into memory at the same time, maybe you could stride a little at
the beginning, but it's not recommended.""")
        sys.exit(1)

    # if we have a metric that explicitly operates on a subset of indices,
    # then we provide the option to only load those indices into memory
    # WARNING: I also do something a bit dirty, and inject `None` for the
    # RMSD.atomindices to get the metric to not splice
    if isinstance(metric, metrics.RMSD):
        atom_indices = metric.atomindices
        metric.atomindices = None  # probably bad...
        logger.info('RMSD metric - loading only the atom indices required')
    else:
        atom_indices = None

    # In case the clustering / algorithm needs extra arguments, use
    # this dictionary
    extra_kwargs = {}

    # Check to be sure we won't overwrite any data
    if args.alg == 'hierarchical':
        zmatrix_fn = os.path.join(args.output_dir, 'ZMatrix.h5')
        die_if_path_exists(zmatrix_fn)
        extra_kwargs['zmatrix_fn'] = zmatrix_fn
    else:
        generators_fn = os.path.join(args.output_dir, 'Gens.h5')
        die_if_path_exists(generators_fn)
        if args.stride == 1:
            assignments_fn = os.path.join(args.output_dir, 'Assignments.h5')
            distances_fn = os.path.join(args.output_dir, 'Assignments.h5.distances')
            die_if_path_exists([assignments_fn, distances_fn])

    project = Project.load_from(args.project)

    if isinstance(metric, metrics.Vectorized) and not args.alg == 'hierarchical':
        # if the metric is vectorized then we can load prepared trajectories,
        # which may allow for better memory efficiency
        ptrajs, which = load_prep_trajectories(project, args.stride,
                                               atom_indices, metric)
        trajectories = None
        n_trajs = len(ptrajs)
        num_frames = np.sum([len(p) for p in ptrajs])
        if num_frames != len(which):
            raise Exception("something went wrong in loading step (%d v %d)"
                            % (num_frames, len(which)))
    else:
        trajectories = load_trajectories(project, args.stride, atom_indices)
        ptrajs = None
        which = None
        n_trajs = len(trajectories)

    logger.info('Loaded %d trajs', n_trajs)

    clusterer = cluster(metric, trajectories, ptrajs, args, **extra_kwargs)

    if not isinstance(clusterer, clustering.Hierarchical):
        if isinstance(metric, metrics.Vectorized):
            gen_inds = clusterer.get_generator_indices()
            generators = project.load_frame(which[gen_inds, 0], which[gen_inds, 1])
        else:
            generators = clusterer.get_generators_as_traj()

        logger.info('Saving %s', generators_fn)
        generators.save(generators_fn)

        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()

            logger.info('Since stride=1, Saving %s', assignments_fn)
            logger.info('Since stride=1, Saving %s', distances_fn)
            io.saveh(assignments_fn, assignments)
            io.saveh(distances_fn, distances)
def test_save(self):
    """Save HDF5 to disk and load it back up"""
    io.saveh(self.filename2, self.data)
    TestData = io.loadh(self.filename2, 'arr_0')
    eq(TestData, self.data)
def save_container(filename, dtype):
    io.saveh(filename,
             arr_0=-1 * np.ones((project.n_trajs, np.max(project.traj_lengths)),
                                dtype=dtype),
             completed_trajs=np.zeros((project.n_trajs), dtype=np.bool))
n_parms = len(parms)
n_trajs = end_traj - start_traj + 1
print "there are %d parameters" % n_parms
print "there are %d trajectories in the 'analysis/parameters' folder" % n_trajs

# getting tICA
print "Obtaining tICA object..."
tica = ti.tICA(n_components=None, lag_time=tica_lag)
dataset1 = train(start_traj, end_traj, n_parms)
tica.fit(dataset1)
print "first 5 tICA eigenvalues:", tica.eigenvalues_[0:5]
tica.save('analysis/tica_l%d.h5' % tica_lag)
print "saved tICA object: 'tica_l%d.h5' in folder 'analysis' " % tica_lag

# projecting and plotting tICA
tica = io.loadh('analysis/tica_l%d.h5' % tica_lag)
dataset = project(start_traj, end_traj, n_parms, tica, tica_lag)

ev0, ev1 = [], []
for i in range(n_trajs):
    ev0.extend(dataset[i][:, 0])
    ev1.extend(dataset[i][:, 1])
ev0, ev1 = np.array(ev0), np.array(ev1)

io.saveh('analysis/tica_projections/ev0.h5', ev0)
io.saveh('analysis/tica_projections/ev1.h5', ev1)
print "saved all projected frames 'ev0.h5 & ev1.h5' at 'analysis/tica_projections' "

plt.figure(figsize=(12, 8))
plt.hist2d(ev0, ev1, bins=200, norm=LogNorm(), cmap=plt.cm.jet)
plt.savefig('analysis/tica_l%d.pdf' % tica_lag)
print "saved tica landscape for lag time %d at 'analysis/tica_l%d.pdf' " % (tica_lag, tica_lag)
start = time.time()
system = pickle.load(open(sys.argv[2], "rb"))

if sys.argv[1] == 'compile':
    # generating temp dir for resolution shell data
    output_dir = system['map_path'] + "temp/"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # processing resolution shell
    I_dir, num_shells, n_shell = sys.argv[3], int(sys.argv[4]), int(sys.argv[5])
    shell_maps, shell_grids = compile_shell(system, I_dir, num_shells, n_shell)

    # save resolution shell in grid and dictionary formats
    with open(system['map_path'] + "temp/dict_rshell%i_t%i.pickle" % (n_shell, num_shells), "wb") as handle:
        pickle.dump(shell_maps, handle)
    io.saveh(system['map_path'] + "temp/grid_rshell%i_t%i.h5" % (n_shell, num_shells), **shell_grids)

if sys.argv[1] == "reduce":
    # combine resolution shells
    combined_maps, shell_stats = reduce_shells(system, int(sys.argv[3]))
    io.saveh(system['map_path'] + "final_maps.h5", **combined_maps)
    io.saveh(system['map_path'] + "shell_statistics.h5", **shell_stats)

print "elapsed time is %f" % ((time.time() - start) / 60.0)