def run_pcca_plus(num_macrostates, assignments, tProb, output_dir, flux_cutoff=0.0, objective_function="crispness", do_minimization=True):
    """Lump microstates into macrostates with PCCA+ and write all results.

    Writes the fuzzy membership matrix (Chi.dat), the transformation
    matrix (A.dat), the micro->macro mapping (MacroMapping.dat) and the
    relabeled assignments (MacroAssignments.h5) into output_dir.
    """
    # Build the output paths up front and refuse to clobber anything.
    macro_assignments_path = os.path.join(output_dir, "MacroAssignments.h5")
    macro_map_path = os.path.join(output_dir, "MacroMapping.dat")
    chi_path = os.path.join(output_dir, 'Chi.dat')
    a_path = os.path.join(output_dir, 'A.dat')
    arglib.die_if_path_exists([macro_assignments_path, macro_map_path, chi_path, a_path])

    logger.info("Running PCCA+...")
    A, chi, vr, MAP = lumping.pcca_plus(
        tProb, num_macrostates, flux_cutoff=flux_cutoff,
        do_minimization=do_minimization,
        objective_function=objective_function)

    # Relabel the microstate assignments in place using the macro map.
    MSMLib.apply_mapping_to_assignments(assignments, MAP)

    np.savetxt(chi_path, chi)
    np.savetxt(a_path, A)
    np.savetxt(macro_map_path, MAP, "%d")
    msmbuilder.io.saveh(macro_assignments_path, assignments)
    logger.info('Saved output to: %s, %s, %s, %s',
                chi_path, a_path, macro_map_path, macro_assignments_path)
def entry_point():
    """Compute committors and net flux between two state ensembles (CLI)."""
    args = parser.parse_args()

    T = scipy.io.mmread(args.tProb)
    U = np.loadtxt(args.starting).astype(int)
    F = np.loadtxt(args.ending).astype(int)

    # deal with case where have single start or end state
    # TJL note: This should be done in the library now... but leaving it
    if U.shape == ():
        U = np.array([int(U)], dtype=int)
    if F.shape == ():
        F = np.array([int(F)], dtype=int)

    # Check output isn't taken
    committors_path = os.path.join(args.output_dir, "committors.dat")
    net_flux_path = os.path.join(args.output_dir, "net_flux.mtx")
    output_flist = [committors_path, net_flux_path]
    arglib.die_if_path_exists(output_flist)

    Fc, NFlux = run(T, U, F)
    np.savetxt(committors_path, Fc)
    scipy.io.mmwrite(net_flux_path, NFlux)
    logger.info("Saved output to %s", ', '.join(output_flist))
def run(MinLagtime, MaxLagtime, Interval, NumEigen, AssignmentsFn, symmetrize, nProc, output):
    """Compute implied timescales for a range of MSM lag times.

    Builds an MSM at each lag time in [MinLagtime, MaxLagtime] with step
    Interval and extracts the slowest NumEigen implied timescales.
    Saves the result to `output` and returns the array of timescales.
    """
    arglib.die_if_path_exists(output)
    # Setup some model parameters: load assignments only to count states.
    # Support both storage keys ('arr_0' is the default, 'Data' is legacy).
    try:
        Assignments = io.loadh(AssignmentsFn, 'arr_0')
    except KeyError:
        Assignments = io.loadh(AssignmentsFn, 'Data')
    NumStates = Assignments.max() + 1
    if NumStates <= NumEigen - 1:
        # Cannot request more eigenvalues than the transition matrix rank.
        NumEigen = NumStates - 2
        logger.warning(
            "Number of requested eigenvalues exceeds the rank of the transition matrix! Defaulting to the maximum possible number of eigenvalues."
        )
    del Assignments  # free memory before the expensive computation

    logger.info("Getting %d eigenvalues (timescales) for each lagtime...", NumEigen)
    lagTimes = range(MinLagtime, MaxLagtime + 1, Interval)
    logger.info("Building MSMs at the following lag times: %s", lagTimes)

    # Get the implied timescales (eigenvalues)
    impTimes = msm_analysis.get_implied_timescales(AssignmentsFn, lagTimes,
                                                   n_implied_times=NumEigen,
                                                   sliding_window=True,
                                                   symmetrize=symmetrize,
                                                   n_procs=nProc)
    numpy.savetxt(output, impTimes)
    # BUGFIX: previously returned None although the sibling entry_point does
    # `impTimes = run(...)` and re-saves the result; return the array.
    return impTimes
def run(MinLagtime, MaxLagtime, Interval, NumEigen, AssignmentsFn, symmetrize, nProc, output):
    """Compute implied timescales for a range of MSM lag times.

    Builds an MSM at each lag time in [MinLagtime, MaxLagtime] with step
    Interval and extracts the slowest NumEigen implied timescales.
    Saves the result to `output` and returns the array of timescales.
    """
    arglib.die_if_path_exists(output)
    # Setup some model parameters: load assignments only to count states.
    # Support both storage keys ("arr_0" is the default, "Data" is legacy).
    try:
        Assignments = io.loadh(AssignmentsFn, "arr_0")
    except KeyError:
        Assignments = io.loadh(AssignmentsFn, "Data")
    NumStates = Assignments.max() + 1
    if NumStates <= NumEigen - 1:
        # Cannot request more eigenvalues than the transition matrix rank.
        NumEigen = NumStates - 2
        logger.warning(
            "Number of requested eigenvalues exceeds the rank of the transition matrix! Defaulting to the maximum possible number of eigenvalues."
        )
    del Assignments  # free memory before the expensive computation

    logger.info("Getting %d eigenvalues (timescales) for each lagtime...", NumEigen)
    lagTimes = range(MinLagtime, MaxLagtime + 1, Interval)
    logger.info("Building MSMs at the following lag times: %s", lagTimes)

    # Get the implied timescales (eigenvalues)
    impTimes = msm_analysis.get_implied_timescales(
        AssignmentsFn, lagTimes, n_implied_times=NumEigen,
        sliding_window=True, symmetrize=symmetrize, n_procs=nProc
    )
    numpy.savetxt(output, impTimes)
    # BUGFIX: previously returned None although the sibling entry_point does
    # `impTimes = run(...)` and re-saves the result; return the array.
    return impTimes
def main():
    """CLI entry point: cut a precomputed hierarchical-clustering Z-matrix
    into state assignments, selected by state count or cophenetic distance.
    """
    parser = arglib.ArgumentParser(
        description='Assign data using a hierarchical clustering')
    parser.add_argument('hierarchical_clustering_zmatrix',
                        default='./Data/Zmatrix.h5',
                        help='Path to hierarchical clustering zmatrix')
    parser.add_argument('num_states', help='Number of States', default='none')
    parser.add_argument('cutoff_distance', help='Maximum cophenetic distance',
                        default='none')
    parser.add_argument('assignments', type=str)
    args = parser.parse_args()

    # The literal string 'none' is the sentinel for "not provided".
    k = int(args.num_states) if args.num_states != 'none' else None
    d = float(args.cutoff_distance) if args.cutoff_distance != 'none' else None
    if k is None and d is None:
        logger.error(
            'You need to supply either a number of states or a cutoff distance'
        )
        sys.exit(1)
    arglib.die_if_path_exists(args.assignments)
    # NOTE(review): `hierarchical_clustering_zmatrix` below is a bare name,
    # not `args.hierarchical_clustering_zmatrix` — presumably a module or
    # object defined elsewhere in this file; confirm it is not a typo for
    # the CLI argument (the sibling entry_point at L12/L13 loads it instead).
    assignments = hierarchical_clustering_zmatrix.get_assignments(
        k=k, cutoff_distance=d)
    msmbuilder.io.saveh(args.assignments, assignments)
    logger.info('Saved assignments to %s', args.assignments)
def run(lagtime, assignments, symmetrize='MLE', input_mapping="None", trim=True, out_dir="./Data/"):
    """Build an MSM from state assignments and write every artifact to out_dir.

    Produces tProb.mtx, tCounts.mtx, Mapping.dat, Assignments.Fixed.h5 and
    Populations.dat. With trim=True, non-ergodic states are removed and the
    assignments are relabeled in place.
    """
    # Output filenames; refuse to overwrite any existing result.
    fn_tprob = os.path.join(out_dir, "tProb.mtx")
    fn_tcounts = os.path.join(out_dir, "tCounts.mtx")
    fn_map = os.path.join(out_dir, "Mapping.dat")
    fn_ass = os.path.join(out_dir, "Assignments.Fixed.h5")
    fn_pops = os.path.join(out_dir, "Populations.dat")
    outputlist = [fn_tprob, fn_tcounts, fn_map, fn_ass, fn_pops]
    arglib.die_if_path_exists(outputlist)

    assert lagtime > 0, 'Please specify a positive lag time.'

    # Optionally relabel states before counting.
    if input_mapping != "None":
        MSMLib.apply_mapping_to_assignments(assignments, input_mapping)

    # -1 marks frames with no assignment; count the assigned ones.
    n_assigns_before_trim = int(np.count_nonzero(assignments.flatten() != -1))

    counts = MSMLib.get_count_matrix_from_assignments(
        assignments, lag_time=lagtime, sliding_window=True)
    rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
        counts, symmetrize=symmetrize, ergodic_trimming=trim)

    if trim:
        MSMLib.apply_mapping_to_assignments(assignments, mapping)
        n_assigns_after_trim = int(np.count_nonzero(assignments.flatten() != -1))
        # Compose with the input mapping, if one was supplied.
        if input_mapping != "None":
            mapping = mapping[input_mapping]
        # Report how much data the ergodic trimming discarded.
        percent = (1.0 - float(n_assigns_after_trim) / float(n_assigns_before_trim)) * 100.0
        logger.warning("Ergodic trimming discarded: %f percent of your data", percent)
    else:
        logger.warning("No ergodic trimming applied")

    # Save all output
    np.savetxt(fn_pops, populations)
    np.savetxt(fn_map, mapping, "%d")
    scipy.io.mmwrite(str(fn_tprob), t_matrix)
    scipy.io.mmwrite(str(fn_tcounts), rev_counts)
    io.saveh(fn_ass, assignments)
    for output in outputlist:
        logger.info("Wrote: %s", output)
    return
def entry_point():
    """Dispatch microstate lumping to PCCA or PCCA+ based on CLI args."""
    args = parser.parse_args()

    # Assignments may be stored under 'arr_0' (default) or 'Data' (legacy).
    try:
        assignments = io.loadh(args.assignments, 'arr_0')
    except KeyError:
        assignments = io.loadh(args.assignments, 'Data')
    tProb = scipy.io.mmread(args.tProb)

    # workaround for arglib funniness?
    args.do_minimization = args.do_minimization not in ["False", "0"]

    if args.algorithm == 'PCCA':
        macro_assignments_fn = os.path.join(args.output_dir, "MacroAssignments.h5")
        macro_map_fn = os.path.join(args.output_dir, "MacroMapping.dat")
        arglib.die_if_path_exists([macro_assignments_fn, macro_map_fn])

        MAP, assignments = run_pcca(args.num_macrostates, assignments, tProb)
        np.savetxt(macro_map_fn, MAP, "%d")
        io.saveh(macro_assignments_fn, assignments)
        logger.info("Saved output to: %s, %s", macro_assignments_fn, macro_map_fn)
    elif args.algorithm == 'PCCA+':
        macro_assignments_fn = os.path.join(args.output_dir, "MacroAssignments.h5")
        macro_map_fn = os.path.join(args.output_dir, "MacroMapping.dat")
        chi_fn = os.path.join(args.output_dir, 'Chi.dat')
        a_fn = os.path.join(args.output_dir, 'A.dat')
        arglib.die_if_path_exists([macro_assignments_fn, macro_map_fn, chi_fn, a_fn])

        chi, A, MAP, assignments = run_pcca_plus(
            args.num_macrostates, assignments, tProb, args.flux_cutoff,
            objective_function=args.objective_function,
            do_minimization=args.do_minimization)
        np.savetxt(chi_fn, chi)
        np.savetxt(a_fn, A)
        np.savetxt(macro_map_fn, MAP, "%d")
        io.saveh(macro_assignments_fn, assignments)
        logger.info('Saved output to: %s, %s, %s, %s',
                    chi_fn, a_fn, macro_map_fn, macro_assignments_fn)
    else:
        raise Exception()
def entry_point():
    """Compute SASA for a project (or one trajectory) and save to HDF5."""
    args = parser.parse_args()
    arglib.die_if_path_exists(args.output)

    # 'all' means every atom; otherwise read indices from a text file.
    if args.atom_indices.lower() == 'all':
        atom_indices = None
    else:
        atom_indices = np.loadtxt(args.atom_indices).astype(int)

    project = Project.load_from(args.project)
    sasa = run(project, atom_indices, args.traj_fn)
    io.saveh(args.output, sasa)
def entry_point():
    """Run macrostate lumping (PCCA or PCCA+) from the command line."""
    args = parser.parse_args()

    # load args; support both the new 'arr_0' key and the legacy 'Data' key
    try:
        assignments = io.loadh(args.assignments, 'arr_0')
    except KeyError:
        assignments = io.loadh(args.assignments, 'Data')
    tProb = scipy.io.mmread(args.tProb)

    # workaround for arglib funniness?
    if args.do_minimization in ["False", "0"]:
        args.do_minimization = False
    else:
        args.do_minimization = True

    # Common output paths for both algorithms.
    macro_assignments_fn = os.path.join(args.output_dir, "MacroAssignments.h5")
    macro_map_fn = os.path.join(args.output_dir, "MacroMapping.dat")

    if args.algorithm == 'PCCA':
        arglib.die_if_path_exists([macro_assignments_fn, macro_map_fn])
        MAP, assignments = run_pcca(args.num_macrostates, assignments, tProb)
        np.savetxt(macro_map_fn, MAP, "%d")
        io.saveh(macro_assignments_fn, assignments)
        logger.info("Saved output to: %s, %s", macro_assignments_fn, macro_map_fn)
    elif args.algorithm == 'PCCA+':
        chi_fn = os.path.join(args.output_dir, 'Chi.dat')
        a_fn = os.path.join(args.output_dir, 'A.dat')
        arglib.die_if_path_exists([macro_assignments_fn, macro_map_fn, chi_fn, a_fn])
        chi, A, MAP, assignments = run_pcca_plus(
            args.num_macrostates, assignments, tProb, args.flux_cutoff,
            objective_function=args.objective_function,
            do_minimization=args.do_minimization)
        np.savetxt(chi_fn, chi)
        np.savetxt(a_fn, A)
        np.savetxt(macro_map_fn, MAP, "%d")
        io.saveh(macro_assignments_fn, assignments)
        logger.info('Saved output to: %s, %s, %s, %s',
                    chi_fn, a_fn, macro_map_fn, macro_assignments_fn)
    else:
        raise Exception()
def run(lagtime, assignments, symmetrize='MLE', input_mapping="None", trim=True, out_dir="./Data/"):
    """Estimate an MSM at the given lag time and write all outputs to out_dir."""
    # set the filenames for output
    out_paths = [os.path.join(out_dir, name) for name in
                 ("tProb.mtx", "tCounts.mtx", "Mapping.dat",
                  "Assignments.Fixed.h5", "Populations.dat")]
    FnTProb, FnTCounts, FnMap, FnAss, FnPops = out_paths
    # make sure none are taken
    arglib.die_if_path_exists(out_paths)

    # Check for valid lag time
    assert lagtime > 0, 'Please specify a positive lag time.'

    # if given, apply mapping to assignments
    if input_mapping != "None":
        MSMLib.apply_mapping_to_assignments(assignments, input_mapping)

    # Frames marked -1 carry no state assignment.
    n_assigns_before_trim = int((assignments.flatten() != -1).sum())

    counts = MSMLib.get_count_matrix_from_assignments(
        assignments, lag_time=lagtime, sliding_window=True)
    rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
        counts, symmetrize=symmetrize, ergodic_trimming=trim)

    if trim:
        MSMLib.apply_mapping_to_assignments(assignments, mapping)
        n_assigns_after_trim = int((assignments.flatten() != -1).sum())
        # if had input mapping, then update it
        if input_mapping != "None":
            mapping = mapping[input_mapping]
        # Print a statement showing how much data was discarded in trimming
        percent = (1.0 - float(n_assigns_after_trim) / float(n_assigns_before_trim)) * 100.0
        logger.warning("Ergodic trimming discarded: %f percent of your data", percent)
    else:
        logger.warning("No ergodic trimming applied")

    # Save all output
    np.savetxt(FnPops, populations)
    np.savetxt(FnMap, mapping, "%d")
    scipy.io.mmwrite(str(FnTProb), t_matrix)
    scipy.io.mmwrite(str(FnTCounts), rev_counts)
    io.saveh(FnAss, assignments)
    for output in out_paths:
        logger.info("Wrote: %s", output)
    return
def check_paths(args):
    """Abort if any clustering output path for the chosen algorithm exists."""
    if args.alg == 'hierarchical':
        die_if_path_exists(args.hierarchical_save_zmatrix)
        return
    die_if_path_exists(args.generators)
    # Assignments and distances are only produced when no striding is used.
    if args.stride == 1:
        die_if_path_exists(args.assignments)
        die_if_path_exists(args.distances)
def entry_point():
    """Assign frames from a hierarchical-clustering Z-matrix (CLI)."""
    args = parser.parse_args()

    # The literal string 'none' means the option was not supplied.
    k = None if args.num_states == 'none' else int(args.num_states)
    d = None if args.cutoff_distance == 'none' else float(args.cutoff_distance)

    arglib.die_if_path_exists(args.assignments)
    if k is None and d is None:
        logger.error(
            'You need to supply either a number of states or a cutoff distance')
        sys.exit(1)

    project = Project.load_from(args.project)
    assignments = main(k, d, args.hierarchical_clustering_zmatrix,
                       args.stride, project)
    io.saveh(args.assignments, assignments)
    logger.info('Saved assignments to %s', args.assignments)
def entry_point():
    """CLI wrapper: cut a hierarchical clustering into state assignments."""
    args = parser.parse_args()

    # 'none' is the sentinel for an unsupplied option.
    k = int(args.num_states) if args.num_states != 'none' else None
    d = float(args.cutoff_distance) if args.cutoff_distance != 'none' else None

    arglib.die_if_path_exists(args.assignments)
    if k is None and d is None:
        logger.error(
            'You need to supply either a number of states or a cutoff distance'
        )
        sys.exit(1)

    project = Project.load_from(args.project)
    zmatrix_path = args.hierarchical_clustering_zmatrix
    assignments = main(k, d, zmatrix_path, args.stride, project)
    io.saveh(args.assignments, assignments)
    logger.info('Saved assignments to %s', args.assignments)
def entry_point():
    """Compute distances from project frames to a reference PDB (CLI)."""
    args, metric = parser.parse_args()
    arglib.die_if_path_exists(args.output)

    project = Project.load_from(args.project)
    pdb = md.load(args.pdb)
    # 'all' means every trajectory in the project.
    traj_fn = None if args.traj_fn.lower() == 'all' else args.traj_fn

    distances = run(project, pdb, metric, traj_fn)
    io.saveh(args.output, distances)
    logger.info('Saved to %s', args.output)
def run_pcca(num_macrostates, assignments, tProb, output_dir):
    """Lump microstates into macrostates with PCCA and save the results."""
    macro_assignments_fn = os.path.join(output_dir, "MacroAssignments.h5")
    macro_map_fn = os.path.join(output_dir, "MacroMapping.dat")
    arglib.die_if_path_exists([macro_assignments_fn, macro_map_fn])

    logger.info("Running PCCA...")
    MAP = lumping.PCCA(tProb, num_macrostates)

    # MAP the new assignments and save, make sure don't
    # mess up negative one's (ie where don't have data)
    MSMLib.apply_mapping_to_assignments(assignments, MAP)

    np.savetxt(macro_map_fn, MAP, "%d")
    msmbuilder.io.saveh(macro_assignments_fn, assignments)
    logger.info("Saved output to: %s, %s", macro_assignments_fn, macro_map_fn)
def entry_point():
    """Trim assignments by a per-frame distance cutoff and save (CLI)."""
    args = parser.parse_args()
    arglib.die_if_path_exists(args.output)

    # Both files use either the new 'arr_0' key or the legacy 'Data' key.
    try:
        assignments = io.loadh(args.assignments, 'arr_0')
        distances = io.loadh(args.distances, 'arr_0')
    except KeyError:
        assignments = io.loadh(args.assignments, 'Data')
        distances = io.loadh(args.distances, 'Data')

    trimmed = run(assignments, distances, args.rmsd_cutoff)
    io.saveh(args.output, trimmed)
    logger.info('Saved output to %s', args.output)
def entry_point():
    """Compute implied timescales over a range of lag times (CLI)."""
    args = parser.parse_args()
    arglib.die_if_path_exists(args.output)

    # The lag-time range is passed as a comma-separated 'min,max' pair.
    lag_fields = args.lagtime.split(',')
    min_lag = int(lag_fields[0])
    max_lag = int(lag_fields[1])

    # Normalize the symmetrize flag: textual 'None' means no symmetrization.
    if args.symmetrize in ["None", "none", None]:
        args.symmetrize = None

    imp_times = run(min_lag, max_lag, args.interval, args.eigvals,
                    args.assignments, (not args.notrim), args.symmetrize,
                    args.procs)
    np.savetxt(args.output, imp_times)
    logger.info("Saved output to %s", args.output)
def entry_point():
    """CLI wrapper around run(): implied timescales across lag times."""
    args = parser.parse_args()
    arglib.die_if_path_exists(args.output)

    # args.lagtime is 'min,max'.
    parts = args.lagtime.split(',')
    min_lagtime, max_lagtime = int(parts[0]), int(parts[1])

    # Pass the symmetric flag; 'None'/'none' strings mean no symmetrization.
    if args.symmetrize in ["None", "none", None]:
        args.symmetrize = None

    implied_timescales = run(min_lagtime, max_lagtime, args.interval,
                             args.eigvals, args.assignments,
                             (not args.notrim), args.symmetrize, args.procs)
    np.savetxt(args.output, implied_timescales)
    logger.info("Saved output to %s", args.output)
def entry_point():
    """Run tICA over a project's trajectories (CLI)."""
    args, prep_metric = parser.parse_args()
    arglib.die_if_path_exists(args.output)

    # 'all' selects every atom; otherwise load an index file.
    if args.atom_indices.lower() == 'all':
        atom_indices = None
    else:
        atom_indices = np.loadtxt(args.atom_indices).astype(int)

    project = Project.load_from(args.project)
    # Convert via float first so scientific notation like '1E3' parses;
    # int() cannot parse such strings directly.
    min_length = int(float(args.min_length))

    tica_obj = run(prep_metric, project, args.delta_time,
                   atom_indices=atom_indices, output=args.output,
                   min_length=min_length, stride=args.stride)
def run(LagTime, assignments, Symmetrize='MLE', input_mapping="None", Prior=0.0, OutDir="./Data/"):
    """Build an MSM at LagTime from assignments and write all artifacts.

    Writes tProb.mtx, tCounts.mtx, Mapping.dat, Assignments.Fixed.h5 and
    Populations.dat into OutDir. Ergodic trimming is always applied and the
    assignments are relabeled in place.

    Note: `Prior` is accepted for backward compatibility with existing
    callers but is not used by this implementation.
    """
    # set the filenames for output
    FnTProb = os.path.join(OutDir, "tProb.mtx")
    FnTCounts = os.path.join(OutDir, "tCounts.mtx")
    FnMap = os.path.join(OutDir, "Mapping.dat")
    FnAss = os.path.join(OutDir, "Assignments.Fixed.h5")
    FnPops = os.path.join(OutDir, "Populations.dat")
    # make sure none are taken
    outputlist = [FnTProb, FnTCounts, FnMap, FnAss, FnPops]
    arglib.die_if_path_exists(outputlist)

    # if given, apply mapping to assignments
    if input_mapping != "None":
        MSMLib.apply_mapping_to_assignments(assignments, input_mapping)

    # -1 marks unassigned frames; count the assigned ones before trimming.
    # (Removed an unused `n_states` computation that was dead code.)
    n_assigns_before_trim = len(np.where(assignments.flatten() != -1)[0])

    rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
        assignments, lag_time=LagTime, symmetrize=Symmetrize,
        sliding_window=True, trim=True)

    MSMLib.apply_mapping_to_assignments(assignments, mapping)
    n_assigns_after_trim = len(np.where(assignments.flatten() != -1)[0])

    # if had input mapping, then update it
    if input_mapping != "None":
        mapping = mapping[input_mapping]

    # Print a statement showing how much data was discarded in trimming
    percent = (1.0 - float(n_assigns_after_trim) / float(n_assigns_before_trim)) * 100.0
    logger.warning("Ergodic trimming discarded: %f percent of your data", percent)

    # Save all output
    np.savetxt(FnPops, populations)
    np.savetxt(FnMap, mapping, "%d")
    scipy.io.mmwrite(str(FnTProb), t_matrix)
    scipy.io.mmwrite(str(FnTCounts), rev_counts)
    msmbuilder.io.saveh(FnAss, assignments)
    for output in outputlist:
        logger.info("Wrote: %s", output)
    return
def run_pcca_plus(num_macrostates, assignments, tProb, output_dir, flux_cutoff=0.0, objective_function="crispness", do_minimization=True):
    """Run PCCA+ lumping and persist Chi, A, the macro map and assignments."""
    # Refuse to overwrite any of the four outputs.
    out_assignments = os.path.join(output_dir, "MacroAssignments.h5")
    out_map = os.path.join(output_dir, "MacroMapping.dat")
    out_chi = os.path.join(output_dir, 'Chi.dat')
    out_a = os.path.join(output_dir, 'A.dat')
    arglib.die_if_path_exists([out_assignments, out_map, out_chi, out_a])

    logger.info("Running PCCA+...")
    A, chi, vr, MAP = lumping.pcca_plus(tProb, num_macrostates,
                                        flux_cutoff=flux_cutoff,
                                        do_minimization=do_minimization,
                                        objective_function=objective_function)

    # Rewrite micro assignments as macro assignments, in place.
    MSMLib.apply_mapping_to_assignments(assignments, MAP)

    np.savetxt(out_chi, chi)
    np.savetxt(out_a, A)
    np.savetxt(out_map, MAP, "%d")
    msmbuilder.io.saveh(out_assignments, assignments)
    logger.info('Saved output to: %s, %s, %s, %s',
                out_chi, out_a, out_map, out_assignments)
def entry_point():
    """Compute MFPTs to one state, or all pairwise MFPTs when state == -1."""
    args = parser.parse_args()
    T = scipy.io.mmread(args.tProb)
    state = int(args.state)
    # NOTE(review): leftover debug print; kept to preserve stdout behavior,
    # but consider removing or demoting to logger.debug.
    print(args.state, state)

    # Check output isn't taken
    if state == -1:
        base_filename = "PairwiseMFPTs.dat"
    else:
        base_filename = "MFPTs_%d.dat" % state
    output_filename = os.path.join(args.output_dir, base_filename)
    arglib.die_if_path_exists(output_filename)

    MFPTs = run(T, state)
    np.savetxt(output_filename, MFPTs)
    # FIX: use lazy %-style logging arguments instead of eagerly formatting
    # the message with the % operator.
    logger.info("Saved output to %s", output_filename)
def entry_point():
    """CLI wrapper: prepare inputs and launch the tICA computation."""
    args, prep_metric = parser.parse_args()
    arglib.die_if_path_exists(args.output)

    atom_indices = None
    if args.atom_indices.lower() != 'all':
        atom_indices = np.loadtxt(args.atom_indices).astype(int)

    project = Project.load_from(args.project)

    # need to convert to float first because int can't
    # convert a string that is '1E3' for example...weird.
    min_length = int(float(args.min_length))

    tica_obj = run(prep_metric, project, args.delta_time,
                   atom_indices=atom_indices,
                   output=args.output,
                   min_length=min_length,
                   stride=args.stride)
def main():
    """CLI entry point: derive state assignments from a hierarchical
    clustering Z-matrix, by state count (k) or cophenetic distance (d).
    """
    parser = arglib.ArgumentParser(description='Assign data using a hierarchical clustering')
    parser.add_argument('hierarchical_clustering_zmatrix',
                        default='./Data/Zmatrix.h5',
                        help='Path to hierarchical clustering zmatrix'
                        )
    parser.add_argument('num_states', help='Number of States', default='none')
    parser.add_argument('cutoff_distance', help='Maximum cophenetic distance',
                        default='none')
    parser.add_argument('assignments', type=str)
    args = parser.parse_args()

    # 'none' sentinel strings mean "option not supplied".
    k = int(args.num_states) if args.num_states != 'none' else None
    d = float(args.cutoff_distance) if args.cutoff_distance != 'none' else None
    if k is None and d is None:
        logger.error('You need to supply either a number of states or a cutoff distance')
        sys.exit(1)
    arglib.die_if_path_exists(args.assignments)
    # NOTE(review): `hierarchical_clustering_zmatrix` is a bare name here,
    # not the CLI argument `args.hierarchical_clustering_zmatrix` —
    # presumably an object defined elsewhere in this file; verify.
    assignments = hierarchical_clustering_zmatrix.get_assignments(k=k, cutoff_distance=d)
    msmbuilder.io.saveh(args.assignments, assignments)
    logger.info('Saved assignments to %s', args.assignments)
def save(confs_by_state, states, style, format, outdir):
    "Save the results to disk"
    def _write(traj, fn):
        # Never clobber an existing file; announce each write.
        arglib.die_if_path_exists(fn)
        logger.info("Saving file: %s" % fn)
        traj.save(fn)

    if style == 'sep':
        # One file per conformation: State<state>-<index>.<ext>
        for i, trj in enumerate(confs_by_state):
            for j in xrange(len(trj)):
                _write(trj[j],
                       os.path.join(outdir, 'State%d-%d.%s' % (states[i], j, format)))
    elif style == 'tps':
        # One file per state holding all of its conformations.
        for i, trj in enumerate(confs_by_state):
            _write(trj, os.path.join(outdir, 'State%d.%s' % (states[i], format)))
    elif style == 'one':
        # Everything concatenated into a single file.
        fn = os.path.join(outdir, 'Confs.%s' % format)
        arglib.die_if_path_exists(fn)
        logger.info("Saving file: %s" % fn)
        concatenate_trajectories(confs_by_state).save(fn)
    else:
        raise ValueError('Invalid style: %s' % style)
def entry_point():
    """Find the top transition pathways between two state ensembles (CLI)."""
    args = parser.parse_args()

    F = np.loadtxt(args.ending).astype(int)
    U = np.loadtxt(args.starting).astype(int)
    tprob = scipy.io.mmread(args.tprob)

    # deal with case where have single start or end state
    # TJL note: this should be taken care of in library now... keeping it
    # just in case
    if F.shape == ():
        F = np.array([int(F)], dtype=int)
    if U.shape == ():
        U = np.array([int(U)], dtype=int)

    arglib.die_if_path_exists(args.output)
    paths, bottlenecks, fluxes = run(tprob, U, F, args.number)
    io.saveh(args.output, Paths=paths, Bottlenecks=bottlenecks, fluxes=fluxes)
    logger.info('Saved output to %s', args.output)
def main_extract(args):
    "main method for the extract subcommand"
    # Load project metadata; CLI values arrive as strings, coerce to int.
    project = Project.load_from(args.project_info)
    close = int(args.close)
    stride = int(args.stride)
    # A negative 'far' means "no far cutoff".
    if args.far < 0:
        far = None
    else:
        far = args.far
    die_if_path_exists(args.output)
    # Dispatch on the requested distance metric for triplet extraction.
    if args.extract_method == 'rmsd':
        atomindices = np.loadtxt(args.atomindices, dtype=int)
        AtoB, AtoC = triplets.extract_rmsd(project, close, stride, atomindices, far)
    elif args.extract_method == 'dihedral':
        # Dihedrals accept either named types or explicit atom indices.
        if 'types' in args:
            AtoB, AtoC = triplets.extract_dihedral(project, close, stride, types=args.types, far=far)
        else:
            indices = np.loadtxt(args.indices, dtype=int)
            AtoB, AtoC = triplets.extract_dihedral(project, close, stride, indices=indices, far=far)
    elif args.extract_method == 'recipcontact':
        AtoB, AtoC = triplets.extract_recipcontact(project, close, stride, far=far)
    elif args.extract_method == 'drmsd':
        indices = np.loadtxt(args.indices, dtype=int)
        AtoB, AtoC, atom_pairs = triplets.extract_drmsd(project, close, stride, indices=indices, far=far)
        # drmsd additionally records which atom pairs were used.
        io.saveh(args.output, atom_pairs=atom_pairs)
    else:
        raise NotImplementedError("Sorry, we don't have that metric")
    #Serializer({'AtoB': AtoB, 'AtoC': AtoC, 'metric': args.extract_method}).SaveToHDF(args.output)
    # The metric name is stored as an array of its characters.
    io.saveh(args.output, AtoB=AtoB, AtoC=AtoC, metric=np.array(list(args.extract_method)))
    # Python 2 print statement (this script targets Python 2).
    print 'Saved triplets to {}'.format(args.output)
def main(args, metric):
    """Cluster all project trajectories and save generators/assignments."""
    # sclarans performs its own stochastic subsampling; striding defeats it.
    if args.alg == "sclarans" and args.stride != 1:
        logger.error(
            """You don't want to use a stride with sclarans. The whole point of sclarans is to use a shrink multiple to accomplish the same purpose, but in parallel with stochastic subsampling. If you cant fit all your frames into memory at the same time, maybe you could stride a little at the begining, but its not recommended."""
        )
        sys.exit(1)
    # if we have a metric that explicitly operates on a subset of indices,
    # then we provide the option to only load those indices into memory
    # WARNING: I also do something a bit dirty, and inject `None` for the
    # RMSD.atomindices to get the metric to not splice
    if isinstance(metric, metrics.RMSD):
        atom_indices = metric.atomindices
        metric.atomindices = None  # probably bad...
        logger.info("RMSD metric - loading only the atom indices required")
    else:
        atom_indices = None
    # In case the clustering / algorithm needs extra arguments, use
    # this dictionary
    extra_kwargs = {}
    # Check to be sure we won't overwrite any data
    if args.alg == "hierarchical":
        zmatrix_fn = os.path.join(args.output_dir, "ZMatrix.h5")
        die_if_path_exists(zmatrix_fn)
        extra_kwargs["zmatrix_fn"] = zmatrix_fn
    else:
        generators_fn = os.path.join(args.output_dir, "Gens.lh5")
        die_if_path_exists(generators_fn)
        # Assignments/distances are only exact (and only written) at stride 1.
        if args.stride == 1:
            assignments_fn = os.path.join(args.output_dir, "Assignments.h5")
            distances_fn = os.path.join(args.output_dir, "Assignments.h5.distances")
            die_if_path_exists([assignments_fn, distances_fn])
    trajs = load_trajectories(args.project, args.stride, atom_indices)
    logger.info("Loaded %d trajs", len(trajs))
    clusterer = cluster(metric, trajs, args, **extra_kwargs)
    # Hierarchical clustering saves its own Z-matrix; nothing more to write.
    if not isinstance(clusterer, clustering.Hierarchical):
        generators = clusterer.get_generators_as_traj()
        logger.info("Saving %s", generators_fn)
        generators.save_to_lhdf(generators_fn)
        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()
            logger.info("Since stride=1, Saving %s", assignments_fn)
            logger.info("Since stride=1, Saving %s", distances_fn)
            io.saveh(assignments_fn, assignments)
            io.saveh(distances_fn, distances)
# Script tail: build an MSM and estimate a rate matrix K from assignments.
# (Earlier parser setup for this script is outside this view.)
parser.add_argument('symmetrize', choices=['none', 'transpose', 'mle'])
parser.add_argument('outdir')
args = parser.parse_args()

assignments = Serializer.LoadData(args.assignments)

# Output paths inside outdir.
ratemtx_fn = pjoin(args.outdir, 'K.mtx')
tcounts_fn = pjoin(args.outdir, 'tCounts.mtx')
unsym_fn = pjoin(args.outdir, 'tCounts.UnSym.mtx')
mapping_fn = pjoin(args.outdir, 'Mapping.dat')
fixed_fn = pjoin(args.outdir, 'Assignments.Fixed.h5')
pops_fn = pjoin(args.outdir, 'Populations.dat')

if not os.path.exists(args.outdir):
    os.mkdir(args.outdir)

# NOTE(review): mapping_fn is written below but is absent from this
# overwrite check — confirm whether that omission is intentional.
outlist = [ratemtx_fn, tcounts_fn, unsym_fn, fixed_fn, pops_fn]
for e in outlist:
    arglib.die_if_path_exists(e)

# if lag time is not one, there's going to be a unit mispatch between
# what you get and what you're expecting.
lag_time = 1
counts, rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
    assignments, lag_time=lag_time, symmetrize=args.symmetrize)
K = MSMLib.estimate_rate_matrix(rev_counts, assignments)

np.savetxt(pops_fn, populations)
np.savetxt(mapping_fn, mapping, "%d")
scipy.io.mmwrite(ratemtx_fn, K)
scipy.io.mmwrite(tcounts_fn, rev_counts)
scipy.io.mmwrite(unsym_fn, counts)
Serializer.SaveData(fixed_fn, assignments)
# NOTE(review): this chunk begins inside run(); the `if` matching the
# `else` below is outside this view (presumably `if state != -1`-style).
        m = tpt.calculate_mfpt([state], T)
        logger.info("Finished calculating MFPTs to state %d" % state)
    else:
        # state == -1 branch: compute the full pairwise MFPT matrix.
        logger.info("Calculating MFPTs to all states")
        m = tpt.calculate_all_to_all_mfpt(T)
        logger.info("Finished calculating MFPTs to all states")
    return m


if __name__ == "__main__":
    # Script entry: compute MFPTs from a transition probability matrix.
    args = parser.parse_args()
    T = scipy.io.mmread(args.tProb)
    state = int(args.state)
    print(args.state, state)

    # Check output isn't taken
    if state == -1:
        base_filename = "PairwiseMFPTs.dat"
    else:
        base_filename = "MFPTs_%d.dat" % state
    output_filename = os.path.join(args.output_dir, base_filename)
    arglib.die_if_path_exists(output_filename)

    MFPTs = run(T, state)
    np.savetxt(output_filename, MFPTs)
    logger.info("Saved output to %s" % output_filename)
# NOTE(review): this chunk opens inside a triple-quoted description string
# whose beginning (the ArgumentParser call) is outside this view.
of all atoms in a given trajectory, or for all trajectories in the project. The output is a hdf5 file which contains the SASA for each atom in each frame in each trajectory (or the single trajectory you passed in.""")
parser.add_argument('project')
parser.add_argument('atom_indices', help='Indices of atoms to calculate SASA', default='all')
parser.add_argument('output', help='''hdf5 file for output. Note this will be THREE dimensional: ( trajectory, frame, atom ), unless you just ask for one trajectory, in which case it will be shape (frame, atom).''', default='SASA.h5')
parser.add_argument('traj_fn', help='''Pass a trajectory file if you only want to calclate the SASA for a single trajectory''', default='all')
args = parser.parse_args()
arglib.die_if_path_exists(args.output)

# 'all' means compute SASA for every atom; otherwise load an index file.
if args.atom_indices.lower() == 'all':
    atom_indices = None
else:
    atom_indices = np.loadtxt(args.atom_indices).astype(int)

project = Project.load_from(args.project)
SASA = run(project, atom_indices, args.traj_fn)
io.saveh(args.output, SASA)
# Script tail: compute committors and net flux between two state ensembles.
# (Earlier parser setup for this script is outside this view.)
parser.add_argument('ending', help='''Vector of states in the ending/products/folded ensemble.''', default='F_states.dat')
parser.add_argument('output_dir', default='.')
args = parser.parse_args()

# Load the transition matrix and the start (U) / end (F) state vectors.
T = scipy.io.mmread( args.tProb )
U = np.loadtxt( args.starting ).astype(int)
F = np.loadtxt( args.ending ).astype(int)

# deal with case where have single start or end state
# TJL note: This should be done in the library now... but leaving it
if U.shape == ():
    tmp = np.zeros(1, dtype=int)
    tmp[0] = int(U)
    U = tmp.copy()
if F.shape == ():
    tmp = np.zeros(1, dtype=int)
    tmp[0] = int(F)
    F = tmp.copy()

# Check output isn't taken
output_list = ["committors.dat", "net_flux.mtx"]
output_flist = [os.path.join(args.output_dir, f) for f in output_list]
arglib.die_if_path_exists(output_flist)

# Fc = forward committors; NFlux = net flux matrix.
Fc, NFlux = run(T, U, F)
np.savetxt(output_flist[0], Fc)
scipy.io.mmwrite(output_flist[1], NFlux)
logger.info("Saved output to %s", ', '.join(output_flist))
def main(args, metric):
    """Cluster project trajectories; save generators and, at stride 1,
    assignments and distances. Vectorized metrics use prepared trajectories
    for better memory efficiency."""
    # sclarans performs its own stochastic subsampling; striding defeats it.
    if args.alg == 'sclarans' and args.stride != 1:
        logger.error("""You don't want to use a stride with sclarans. The whole point of sclarans is to use a shrink multiple to accomplish the same purpose, but in parallel with stochastic subsampling. If you cant fit all your frames into memory at the same time, maybe you could stride a little at the begining, but its not recommended.""")
        sys.exit(1)
    # if we have a metric that explicitly operates on a subset of indices,
    # then we provide the option to only load those indices into memory
    # WARNING: I also do something a bit dirty, and inject `None` for the
    # RMSD.atomindices to get the metric to not splice
    if isinstance(metric, metrics.RMSD):
        atom_indices = metric.atomindices
        metric.atomindices = None  # probably bad...
        logger.info('RMSD metric - loading only the atom indices required')
    else:
        atom_indices = None
    # In case the clustering / algorithm needs extra arguments, use
    # this dictionary
    extra_kwargs = {}
    # Check to be sure we won't overwrite any data
    if args.alg == 'hierarchical':
        zmatrix_fn = os.path.join(args.output_dir, 'ZMatrix.h5')
        die_if_path_exists(zmatrix_fn)
        extra_kwargs['zmatrix_fn'] = zmatrix_fn
    else:
        generators_fn = os.path.join(args.output_dir, 'Gens.h5')
        die_if_path_exists(generators_fn)
        if args.stride == 1:
            assignments_fn = os.path.join(args.output_dir, 'Assignments.h5')
            distances_fn = os.path.join(args.output_dir, 'Assignments.h5.distances')
            die_if_path_exists([assignments_fn, distances_fn])
    project = Project.load_from(args.project)
    if isinstance(metric, metrics.Vectorized) and not args.alg == 'hierarchical':
        # if the metric is vectorized then
        # we can load prepared trajectories
        # which may allow for better memory
        # efficiency
        ptrajs, which = load_prep_trajectories(project, args.stride, atom_indices, metric)
        trajectories = None
        n_trajs = len(ptrajs)
        num_frames = np.sum([len(p) for p in ptrajs])
        # Sanity check: 'which' maps every prepared frame back to (traj, frame).
        if num_frames != len(which):
            raise Exception("something went wrong in loading step (%d v %d)" % (num_frames, len(which)))
    else:
        trajectories = load_trajectories(project, args.stride, atom_indices)
        ptrajs = None
        which = None
        n_trajs = len(trajectories)
    logger.info('Loaded %d trajs', n_trajs)
    clusterer = cluster(metric, trajectories, ptrajs, args, **extra_kwargs)
    # Hierarchical clustering saves its own Z-matrix; nothing more to write.
    if not isinstance(clusterer, clustering.Hierarchical):
        if isinstance(metric, metrics.Vectorized):
            # Map generator indices back to (traj, frame) pairs and load frames.
            gen_inds = clusterer.get_generator_indices()
            generators = project.load_frame(which[gen_inds,0], which[gen_inds,1])
        else:
            generators = clusterer.get_generators_as_traj()
        logger.info('Saving %s', generators_fn)
        generators.save(generators_fn)
        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()
            logger.info('Since stride=1, Saving %s', assignments_fn)
            logger.info('Since stride=1, Saving %s', distances_fn)
            io.saveh(assignments_fn, assignments)
            io.saveh(distances_fn, distances)
def entry_point():
    """Command-line entry: parse arguments, compute atom indices from the
    PDB, and write them to the output file as one integer per line."""
    parsed = parser.parse_args()
    out_path = parsed.output
    # Refuse to clobber an existing output file.
    arglib.die_if_path_exists(out_path)
    selected_indices = run(parsed.pdb, parsed.atom_type)
    np.savetxt(out_path, selected_indices, '%d')
    logger.info('Saved output to %s', out_path)
# Script: generate synthetic trajectories with the EDWProp (extended
# double-well) propagator and lay them out as an MSMBuilder project
# directory (Trajectories/trj*.h5 plus a minimal conf.pdb).
import os, sys
from toy_systems.propagators import EDWProp
from msmbuilder import arglib, Project

parser = arglib.ArgumentParser(description='Create toy data: EDWProp')
# NOTE: arglib's add_argument takes `description` where argparse uses `help`.
parser.add_argument('k', description='Steepness f=0.5*k*x^2 in the harmonic directions', default=1, type=float)
parser.add_argument('dims', description='number of dimensions', default=2, type=int)
parser.add_argument('timesteps', description='number of timesteps', default=10000, type=int)
parser.add_argument('num_trajectories', description='number of trajectories', default=1, type=int)
parser.add_argument('outdir')
args = parser.parse_args()
print args  # Python 2 print statement -- echo parsed options for the user

# Refuse to overwrite an existing output directory, then build the layout.
arglib.die_if_path_exists(args.outdir)
os.mkdir(args.outdir)
trj_dir = os.path.abspath(os.path.join(args.outdir, 'Trajectories'))
os.mkdir(trj_dir)

# One independent propagator run per requested trajectory.
for i in range(args.num_trajectories):
    prop = EDWProp(args.dims, args.k)
    prop.run(args.timesteps)
    traj = prop.trajectory
    traj.SaveToHDF(os.path.join(trj_dir, 'trj{0}.h5'.format(i)))

# Write a single-atom placeholder PDB so downstream tools that expect a
# conformation file can load the project.
pdbfn = os.path.abspath(os.path.join(args.outdir, 'conf.pdb'))
with open(pdbfn, 'w') as f:
    print >> f, "ATOM      1 1HH3 ACE     1       0.000   0.0   0.0\n"
# NOTE(review): this chunk begins mid-file -- the first line below is the
# tail of a parser.add_argument(...) call whose opening triple-quoted help
# string lies above the visible region.
ending/products/folded ensemble.''', default='F_states.dat')
parser.add_argument('output_dir', default='.')
args = parser.parse_args()

# Load the transition matrix and the start/end state definitions.
T = scipy.io.mmread(args.tProb)
U = np.loadtxt(args.starting).astype(int)
F = np.loadtxt(args.ending).astype(int)

# deal with case where have single start or end state
# TJL note: This should be done in the library now... but leaving it
# (np.loadtxt returns a 0-d array for a one-value file; promote to 1-d.)
if U.shape == ():
    tmp = np.zeros(1, dtype=int)
    tmp[0] = int(U)
    U = tmp.copy()
if F.shape == ():
    tmp = np.zeros(1, dtype=int)
    tmp[0] = int(F)
    F = tmp.copy()

# Check output isn't taken
output_list = ["committors.dat", "net_flux.mtx"]
output_flist = [os.path.join(args.output_dir, f) for f in output_list]
arglib.die_if_path_exists(output_flist)

# run() computes forward committors and the net flux matrix.
Fc, NFlux = run(T, U, F)
np.savetxt(output_flist[0], Fc)
scipy.io.mmwrite(output_flist[1], NFlux)
logger.info("Saved output to %s", ', '.join(output_flist))
# NOTE(review): this chunk begins mid-file -- the `except` below belongs to
# a try: block (loading the assignments under key 'arr_0') that lies above
# the visible region.
except KeyError:
    # Older assignments files store the array under 'Data' instead of 'arr_0'.
    assignments = io.loadh(args.assignments, 'Data')

tProb = scipy.io.mmread(args.tProb)

# workaround for arglib funniness?
# (arglib appears to hand back booleans as strings -- TODO confirm.)
if args.do_minimization in ["False", "0"]:
    args.do_minimization = False
else:
    args.do_minimization = True

if args.algorithm == 'PCCA':
    MacroAssignmentsFn = os.path.join(
        args.output_dir, "MacroAssignments.h5")
    MacroMapFn = os.path.join(args.output_dir, "MacroMapping.dat")
    # Refuse to overwrite existing results.
    arglib.die_if_path_exists([MacroAssignmentsFn, MacroMapFn])
    MAP, assignments = run_pcca(args.num_macrostates, assignments, tProb)
    np.savetxt(MacroMapFn, MAP, "%d")
    io.saveh(MacroAssignmentsFn, assignments)
    logger.info("Saved output to: %s, %s", MacroAssignmentsFn, MacroMapFn)
elif args.algorithm == 'PCCA+':
    # PCCA+ additionally emits the membership (Chi) and transform (A)
    # matrices; the branch continues below the visible region.
    MacroAssignmentsFn = os.path.join(
        args.output_dir, "MacroAssignments.h5")
    MacroMapFn = os.path.join(args.output_dir, "MacroMapping.dat")
    ChiFn = os.path.join(args.output_dir, 'Chi.dat')
    AFn = os.path.join(args.output_dir, 'A.dat')
    arglib.die_if_path_exists([MacroAssignmentsFn, MacroMapFn, ChiFn, AFn])
# Script body: build an MSM from assignments, estimate a rate matrix from
# the reversible counts, and write all artifacts into `outdir`.
# NOTE(review): the script's imports/parser setup lie above the visible
# region; names like Serializer, pjoin, MSMLib come from there.
parser.add_argument("symmetrize", choices=["none", "transpose", "mle"])
parser.add_argument("outdir")
args = parser.parse_args()

assignments = Serializer.LoadData(args.assignments)

# Output file paths, all rooted at outdir.
ratemtx_fn = pjoin(args.outdir, "K.mtx")
tcounts_fn = pjoin(args.outdir, "tCounts.mtx")
unsym_fn = pjoin(args.outdir, "tCounts.UnSym.mtx")
mapping_fn = pjoin(args.outdir, "Mapping.dat")
fixed_fn = pjoin(args.outdir, "Assignments.Fixed.h5")
pops_fn = pjoin(args.outdir, "Populations.dat")

if not os.path.exists(args.outdir):
    os.mkdir(args.outdir)

# NOTE(review): mapping_fn is written below but is absent from this
# overwrite check -- looks like an oversight; confirm before relying on it.
outlist = [ratemtx_fn, tcounts_fn, unsym_fn, fixed_fn, pops_fn]
for e in outlist:
    arglib.die_if_path_exists(e)

# if lag time is not one, there's going to be a unit mispatch between
# what you get and what you're expecting.
lag_time = 1

counts, rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
    assignments, lag_time=lag_time, symmetrize=args.symmetrize
)
K = MSMLib.estimate_rate_matrix(rev_counts, assignments)

# Persist everything: populations and mapping as flat text, matrices in
# MatrixMarket format, and the (trimmed/mapped) assignments as HDF5.
np.savetxt(pops_fn, populations)
np.savetxt(mapping_fn, mapping, "%d")
scipy.io.mmwrite(ratemtx_fn, K)
scipy.io.mmwrite(tcounts_fn, rev_counts)
scipy.io.mmwrite(unsym_fn, counts)
Serializer.SaveData(fixed_fn, assignments)
def main(args, metric):
    """Run the clustering pipeline: validate arguments, load trajectory data,
    cluster it with the chosen algorithm, and save generators/assignments.

    Parameters
    ----------
    args : argparse-style namespace with (at least) alg, stride, output_dir,
        and project attributes -- presumably produced by this script's
        arglib parser; TODO confirm against caller.
    metric : a distance metric instance (e.g. metrics.RMSD or a
        metrics.Vectorized subclass) used for clustering.

    Side effects: writes Gens.h5 / ZMatrix.h5 and, when stride == 1,
    Assignments.h5 and Assignments.h5.distances under args.output_dir.
    """
    # sclarans does its own stochastic subsampling via a shrink multiple,
    # so combining it with a stride is refused outright.
    if args.alg == 'sclarans' and args.stride != 1:
        logger.error(
            """You don't want to use a stride with sclarans. The whole point of sclarans is to use a shrink multiple to accomplish the same purpose, but in parallel with stochastic subsampling. If you cant fit all your frames into memory at the same time, maybe you could stride a little at the begining, but its not recommended.""")
        sys.exit(1)

    # if we have a metric that explicitly operates on a subset of indices,
    # then we provide the option to only load those indices into memory
    # WARNING: I also do something a bit dirty, and inject `None` for the
    # RMSD.atomindices to get the metric to not splice
    if isinstance(metric, metrics.RMSD):
        atom_indices = metric.atomindices
        metric.atomindices = None  # probably bad...
        logger.info('RMSD metric - loading only the atom indices required')
    else:
        atom_indices = None

    # In case the clustering / algorithm needs extra arguments, use
    # this dictionary
    extra_kwargs = {}

    # Check to be sure we won't overwrite any data
    if args.alg == 'hierarchical':
        zmatrix_fn = os.path.join(args.output_dir, 'ZMatrix.h5')
        die_if_path_exists(zmatrix_fn)
        extra_kwargs['zmatrix_fn'] = zmatrix_fn
    else:
        generators_fn = os.path.join(args.output_dir, 'Gens.h5')
        die_if_path_exists(generators_fn)
        # Assignments are only saved when no subsampling happened.
        if args.stride == 1:
            assignments_fn = os.path.join(args.output_dir, 'Assignments.h5')
            distances_fn = os.path.join(args.output_dir, 'Assignments.h5.distances')
            die_if_path_exists([assignments_fn, distances_fn])

    project = Project.load_from(args.project)

    if isinstance(metric, metrics.Vectorized) and not args.alg == 'hierarchical':
        # if the metric is vectorized then
        # we can load prepared trajectories
        # which may allow for better memory
        # efficiency
        # `which` appears to map prepared-trajectory frames back to
        # (trajectory, frame) pairs -- TODO confirm in load_prep_trajectories.
        ptrajs, which = load_prep_trajectories(project, args.stride, atom_indices, metric)
        trajectories = None
        n_trajs = len(ptrajs)
        num_frames = np.sum([len(p) for p in ptrajs])
        # Sanity check: one bookkeeping row per loaded frame.
        if num_frames != len(which):
            raise Exception("something went wrong in loading step (%d v %d)" % (num_frames, len(which)))
    else:
        trajectories = load_trajectories(project, args.stride, atom_indices)
        ptrajs = None
        which = None
        n_trajs = len(trajectories)

    logger.info('Loaded %d trajs', n_trajs)

    clusterer = cluster(metric, trajectories, ptrajs, args, **extra_kwargs)

    # Hierarchical clustering saves only the ZMatrix (inside cluster());
    # every other algorithm produces generators to persist here.
    if not isinstance(clusterer, clustering.Hierarchical):
        if isinstance(metric, metrics.Vectorized):
            # Map generator indices in the prepared data back to real
            # project frames before saving.
            gen_inds = clusterer.get_generator_indices()
            generators = project.load_frame(which[gen_inds, 0], which[gen_inds, 1])
        else:
            generators = clusterer.get_generators_as_traj()
        logger.info('Saving %s', generators_fn)
        generators.save(generators_fn)
        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()
            logger.info('Since stride=1, Saving %s', assignments_fn)
            logger.info('Since stride=1, Saving %s', distances_fn)
            io.saveh(assignments_fn, assignments)
            io.saveh(distances_fn, distances)
# Script: assign trajectory data to clusters by cutting a precomputed
# hierarchical-clustering linkage matrix (ZMatrix) at either a fixed number
# of states or a cophenetic distance cutoff.
from msmbuilder import io
from msmbuilder.clustering import Hierarchical
from msmbuilder import arglib
import logging
# BUG FIX: sys.exit(1) is called below, but `sys` was never imported in
# this script -- the error path raised NameError instead of exiting.
import sys

logger = logging.getLogger('msmbuilder.scripts.AssignHierarchical')

parser = arglib.ArgumentParser(description='Assign data using a hierarchical clustering')
parser.add_argument('hierarchical_clustering_zmatrix', default='./Data/ZMatrix.h5',
                    help='Path to hierarchical clustering zmatrix')
parser.add_argument('num_states', help='Number of States', default='none')
parser.add_argument('cutoff_distance', help='Maximum cophenetic distance', default='none')
parser.add_argument('assignments', type=str)


def main(k, d, zmatrix_fn):
    """Cut the hierarchical clustering tree and return state assignments.

    Parameters
    ----------
    k : int or None
        Desired number of states (None to use the distance cutoff instead).
    d : float or None
        Maximum cophenetic distance (None to use `k` instead).
    zmatrix_fn : str
        Path to the saved linkage matrix.

    Returns
    -------
    assignments : whatever Hierarchical.get_assignments produces
        (per-frame state labels).
    """
    hierarchical = Hierarchical.load_from_disk(zmatrix_fn)
    assignments = hierarchical.get_assignments(k=k, cutoff_distance=d)
    return assignments


if __name__ == "__main__":
    args = parser.parse_args()
    # The string 'none' is the sentinel for "option not supplied".
    k = int(args.num_states) if args.num_states != 'none' else None
    d = float(args.cutoff_distance) if args.cutoff_distance != 'none' else None
    arglib.die_if_path_exists(args.assignments)
    if k is None and d is None:
        logger.error('You need to supply either a number of states or a cutoff distance')
        sys.exit(1)
    assignments = main(k, d, args.hierarchical_clustering_zmatrix)
    io.saveh(args.assignments, assignments)
    logger.info('Saved assignments to %s', args.assignments)
# NOTE(review): this chunk begins mid-file -- the text below is the tail of
# an ArgumentParser description string opened above the visible region.
radius is simply the average distance of all conformations in a cluster to its generator. Does this by taking averaging the distance of each assigned state to its generator. Output: A flat txt file, 'ClusterRadii.dat', the average RMSD distance to the generator, measured by what ever distance metric was used in assigning."""
)

parser.add_argument("assignments", type=str, default="Data/Assignments.Fixed.h5")
parser.add_argument(
    "distances",
    help="""Path to assignment distances file.""",
    default="Data/Assignments.h5.distances",
)
parser.add_argument("output", default="ClusterRadii.dat")
args = parser.parse_args()

# Refuse to overwrite an existing output file.
arglib.die_if_path_exists(args.output)

# Newer files store arrays under 'arr_0'; older ones use 'Data'.
try:
    assignments = msmbuilder.io.loadh(args.assignments, "arr_0")
    distances = msmbuilder.io.loadh(args.distances, "arr_0")
except KeyError:
    assignments = msmbuilder.io.loadh(args.assignments, "Data")
    distances = msmbuilder.io.loadh(args.distances, "Data")

# main() computes the per-cluster average distance-to-generator.
radii = main(assignments, distances)
np.savetxt(args.output, radii)
logger.info("Wrote: %s", args.output)
def run(lag_time, assignments_list, symmetrize='MLE', input_mapping="None", out_dir="./Data/"):
    """Build an MSM from one or more assignments arrays and save all output.

    Parameters
    ----------
    lag_time : int
        MSM lag time (in assignment frames).
    assignments_list : list of assignment arrays
        One entry per assignments file; each is a 2-D array of state labels.
    symmetrize : str
        Symmetrization scheme passed to MSMLib.build_msm (default 'MLE').
    input_mapping : array or the string "None"
        Optional pre-existing state mapping to apply before counting.
    out_dir : str
        Directory receiving tProb.mtx, tCounts.mtx, Mapping.dat,
        Populations.dat, and the trimmed Assignments.Fixed*.h5 files.
    """
    # set the filenames for output
    tProb_fn = os.path.join(out_dir, "tProb.mtx")
    tCounts_fn = os.path.join(out_dir, "tCounts.mtx")
    map_fn = os.path.join(out_dir, "Mapping.dat")
    pops_fn = os.path.join(out_dir, "Populations.dat")
    if len(assignments_list) == 1:
        assignments_fn_list = [os.path.join(out_dir, "Assignments.Fixed.h5")]
    else:
        assignments_fn_list = [os.path.join(out_dir, "Assignments.Fixed.%d.h5" % i)
                               for i in xrange(len(assignments_list))]

    # make sure none are taken
    output_list = [tProb_fn, tCounts_fn, map_fn, pops_fn] + assignments_fn_list
    arglib.die_if_path_exists(output_list)

    # if given, apply mapping to assignments
    if input_mapping != "None":
        for i in xrange(len(assignments_list)):
            MSMLib.apply_mapping_to_assignments(assignments_list[i], input_mapping)

    n_assigns_before_trim = get_num_assignments(assignments_list)

    # Global state count across ALL assignment sets, so every per-set count
    # matrix has the same shape and the matrices can be summed.
    num_states = np.max([np.max(ass) for ass in assignments_list]) + 1

    # BUG FIX: the first count matrix was previously built with
    # n_states=None, which infers the state count from that one set alone;
    # if it visits fewer states than the global maximum, the additions
    # below fail with a sparse-matrix shape mismatch. Use the global
    # num_states for every set. (Also removed a leftover debug `print i`.)
    counts = MSMLib.get_count_matrix_from_assignments(
        assignments_list[0], n_states=num_states, lag_time=lag_time,
        sliding_window=False)
    for i in xrange(1, len(assignments_list)):
        logger.debug("Accumulating counts from assignments set %d", i)
        counts = counts + MSMLib.get_count_matrix_from_assignments(
            assignments_list[i], n_states=num_states, lag_time=lag_time,
            sliding_window=False)

    rev_counts, t_matrix, populations, mapping = \
        MSMLib.build_msm(counts, symmetrize=symmetrize, ergodic_trimming=True)

    # Relabel every assignments set with the post-trim mapping.
    for i in xrange(len(assignments_list)):
        MSMLib.apply_mapping_to_assignments(assignments_list[i], mapping)

    n_assigns_after_trim = get_num_assignments(assignments_list)

    # if had input mapping, then update it
    if input_mapping != "None":
        mapping = mapping[input_mapping]

    # Print a statement showing how much data was discarded in trimming
    percent = (1.0 - float(n_assigns_after_trim) / float(n_assigns_before_trim)) * 100.0
    logger.warning("Ergodic trimming discarded: "
                   "%f percent of your data", percent)

    # Save all output
    scipy.io.mmwrite(tProb_fn, t_matrix)
    scipy.io.mmwrite(tCounts_fn, rev_counts)
    np.savetxt(map_fn, mapping, "%d")
    np.savetxt(pops_fn, populations)
    for i in xrange(len(assignments_fn_list)):
        assignments_fn = assignments_fn_list[i]
        assignments = assignments_list[i]
        msmbuilder.io.saveh(assignments_fn, assignments)
    for output in output_list:
        logger.info("Wrote: %s", output)
    return