def load(topologies, trajectories, selections, stride, processes):
    """Load several sets of trajectories into one concatenated array.

    Every selection is first checked against its topology so that a bad
    selection fails before any trajectory is read. All selections must
    pick the same number of atoms.

    Parameters
    ----------
    topologies : list of str
        Paths to topology files, one per trajectory set.
    trajectories : list of list of str
        ``trajectories[i]`` holds the trajectory paths that are loaded
        with ``topologies[i]``.
    selections : list of str
        Atom-selection strings, one per topology, passed to the
        topology's ``select`` method.
    stride : int
        Stored under the ``'stride'`` key of every per-trajectory loading
        configuration.
    processes : int
        Forwarded to ``load_as_concatenated``.

    Returns
    -------
    lengths
        First value returned by ``load_as_concatenated``.
    xyz
        A copy of the second value returned by ``load_as_concatenated``.
    top
        The last topology, subset to the atoms picked by its selection.

    Raises
    ------
    exception.ImproperlyConfigured
        If a selection cannot be applied to its topology, if selections
        pick differing numbers of atoms, or if there is nothing to load.
    AssertionError
        If the selections pick zero atoms.
    """
    # Fast-fail pass: applying the selection is a no-op apart from raising
    # when the selection string is invalid for this topology.
    for top, selection in zip(topologies, selections):
        sentinel_trj = md.load(top)
        try:
            sentinel_trj.top.select(selection)
        except Exception as e:
            raise exception.ImproperlyConfigured(
                ("The provided selection '{s}' didn't match the topology "
                 "file, {t}").format(s=selection, t=top)) from e

    flat_trjs = []
    configs = []
    n_inds = None

    for topfile, trjset, selection in zip(topologies, trajectories,
                                          selections):
        top = md.load(topfile).top
        indices = top.select(selection)

        if n_inds is not None and n_inds != len(indices):
            raise exception.ImproperlyConfigured(
                ("Selection on topology %s selected %s atoms, but "
                 "other selections selected %s atoms.") %
                (topfile, len(indices), n_inds))
        n_inds = len(indices)

        # One loader configuration per trajectory, in the same order as
        # flat_trjs.
        for trj in trjset:
            flat_trjs.append(trj)
            configs.append({
                'top': top,
                'stride': stride,
                'atom_indices': indices,
            })

    if n_inds is None:
        # Nothing was iterated, so `top` and `indices` were never bound.
        raise exception.ImproperlyConfigured(
            "No topology/trajectory/selection sets were provided, so "
            "there is nothing to load.")

    logger.info(
        "Loading %s trajectories with %s atoms using %s processes "
        "(subsampling %s)",
        len(flat_trjs), n_inds, processes, stride)

    assert n_inds > 0, "No atoms selected for clustering"

    with timed("Loading took %.1f sec", logger.info):
        lengths, xyz = load_as_concatenated(
            flat_trjs, args=configs, processes=processes)

    # NOTE(review): the copy presumably detaches the data from whatever
    # buffer load_as_concatenated allocated -- confirm against that helper.
    with timed("Turned over array in %.2f min", logger.info):
        tmp_xyz = xyz.copy()
        del xyz
        xyz = tmp_xyz

    logger.info("Loaded %s frames.", len(xyz))

    return lengths, xyz, top.subset(indices)
def process_units(timestep=None, infer_timestep=None):
    """Take the timestep parameter and infer_timestep parameters from
    the command line arguments and convert it to the string indicating
    units (ns) and the factor converting ns to frames.

    Parameters
    ----------
    timestep : float, optional
        Ratio of ns to frames. This is typically 10 (for 100 ps timesteps)
        or 100 (for 10 ps timesteps). Returned unchanged as the factor.
    infer_timestep : str, path, optional
        Path to a trajectory containing timestep information to infer the
        correct timestep from when plotting implied timescales. The
        inferred timestep is treated as picoseconds.

    Returns
    -------
    unit_factor : number
        ``timestep`` if it was given, ``1000 / <inferred timestep>`` if
        ``infer_timestep`` was given, otherwise ``1``.
    unit_str : str
        ``'ns'`` when a timestep is known, otherwise ``'frames'``.

    Raises
    ------
    exception.ImproperlyConfigured
        If both options are supplied, if the trajectory cannot be loaded
        without a topology and is not an XTC file, or if the XTC file has
        too few frames to infer a timestep.
    AssertionError
        If the first frames of the XTC file are not evenly spaced in time.
    """
    if timestep and infer_timestep:
        raise exception.ImproperlyConfigured(
            'Only one of --timestep and --infer-timestep can be '
            'supplied, you supplied both --timestep=%s and '
            '--infer-timestep=%s' % (timestep, infer_timestep))

    if timestep:
        unit_factor = timestep
        unit_str = 'ns'
    elif infer_timestep:
        try:
            timestep = md.load(infer_timestep).timestep
        except ValueError:
            # Fall back to reading frame times straight from the XTC file.
            # str() keeps the suffix check working for path-like objects.
            if not str(infer_timestep).endswith('.xtc'):
                raise exception.ImproperlyConfigured(
                    "Topologyless formats other than XTC are not supported.")

            with md.formats.xtc.XTCTrajectoryFile(infer_timestep) as f:
                _, time, _, _ = f.read(n_frames=10)

            if len(time) < 2:
                # A single frame has no spacing to measure.
                raise exception.ImproperlyConfigured(
                    "Cannot infer a timestep from '%s': at least two frames "
                    "are required." % infer_timestep)

            timesteps = time[1:] - time[0:-1]
            assert np.all(timesteps[0] == timesteps), \
                "Frames in '%s' are not evenly spaced in time." % \
                infer_timestep
            timestep = timesteps[0]

        unit_factor = 1000 / timestep  # units are ps
        unit_str = 'ns'
    else:
        unit_factor = 1
        unit_str = 'frames'

    return unit_factor, unit_str
def calculate_piecewise_helix_vectors(
        trj, helix_resnums=None, helix_start=None, helix_end=None):
    """Compute per-frame direction vectors along an alpha-helix.

    Vectors point from the starting residue toward the ending residue.

    Parameters
    ----------
    trj : md.Trajectory object
        An MDTraj trajectory whose frames the helix vectors are computed
        from.
    helix_resnums : array, shape [n_residues, ], optional, default: None
        Residues that make up the alpha-helix. Useful when the residue
        numbering within a helix is irregular. When omitted, both
        `helix_start` and `helix_end` must be given and every residue
        number from `helix_start` through `helix_end` (inclusive) is used.
    helix_start : int, optional, default: None
        The starting residue of the helix.
    helix_end : int, optional, default: None
        The ending residue of the helix.

    Returns
    ----------
    vectors : array, [n_frames, 3]
        Unit vectors giving the direction of the helix in each frame.
    center_coords : array, [n_frames, 3]
        Mean coordinate of the helix backbone atoms in each frame. Can be
        used to reconstruct a line going through the alpha-helix.

    Raises
    ----------
    exception.ImproperlyConfigured
        If neither `helix_resnums` nor both of `helix_start` and
        `helix_end` are provided.
    """
    if helix_resnums is None:
        if helix_start is None or helix_end is None:
            raise exception.ImproperlyConfigured(
                "Either 'helix_resnums' or 'helix_start' and 'helix_end' "
                "are required.")
        # Build an inclusive residue range from the two end points.
        helix_resnums = np.arange(helix_start, helix_end + 1)

    backbone_idx = _get_backbone_nums(trj.topology, helix_resnums)
    helix_xyz = trj.xyz[:, backbone_idx]

    directions = _generate_vectors_from_coords(helix_xyz, n_avg=12)
    centers = helix_xyz.mean(axis=1)

    return directions, centers
def process_command_line(argv):
    """Parse the RMSD-clustering command line and normalize the result.

    Parameters
    ----------
    argv : list of str
        Full argument vector; the first element is skipped.

    Returns
    -------
    args : argparse.Namespace
        The parsed options, with ``atoms`` repeated once per trajectory
        set, ``subsample`` defaulted to 1 and ``Clusterer`` set to the
        class matching ``--algorithm``.

    Raises
    ------
    exception.ImproperlyConfigured
        If neither a cutoff nor a cluster count is given, or if the
        numbers of --atoms / --topology / --trajectories flags disagree.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=("Cluster a set (or several sets) of trajectories "
                     "into a single state space based upon RMSD."))

    # Options are declared as (flag, keyword arguments) pairs and
    # registered in this order: inputs, parameters, then outputs.
    option_table = [
        # INPUTS
        ('--trajectories', dict(
            required=True, nargs="+", action='append',
            help=("List of paths to aligned trajectory files to cluster. "
                  "All file types that MDTraj supports are supported "
                  "here."))),
        ('--topology', dict(
            required=True, action='append', dest='topologies',
            help=("The topology file for the trajectories. This flag "
                  "must be specified once for each instance of the "
                  "--trajectories flag. The first --topology flag is "
                  "taken to be the topology to use for the first "
                  "instance of the --trajectories flag, and so forth."))),
        # PARAMETERS
        ('--algorithm', dict(
            required=True, choices=["khybrid", "kcenters"],
            help="The clustering algorithm to use.")),
        ('--atoms', dict(
            action="append", required=True,
            help=("The atoms from the trajectories (using MDTraj "
                  "atom-selection syntax) to cluster based upon. Specify "
                  "once to apply this selection to every set of "
                  "trajectories specified by the --trajectories flag, or "
                  "once for each different topology (i.e. the number of "
                  "times --trajectories and --topology was specified.)"))),
        ('--rmsd-cutoff', dict(
            default=None, type=float,
            help=("Produce clusters with a maximum distance to cluster "
                  "center of this value.. Units: nm."))),
        ('--n-clusters', dict(
            default=None, type=int,
            help="Produce at least this number of clusters.")),
        ('--processes', dict(
            default=cpu_count(), type=int,
            help="Number processes to use for loading and clustering.")),
        ('--subsample', dict(
            default=None, type=int,
            help=("Take only every nth frame when loading trajectories. "
                  "1 implies no subsampling."))),
        ('--no-reassign', dict(
            default=False, action='store_true',
            help=("Do not do a reassigment step. Ignored if --subsample "
                  "is not supplied or 1."))),
        # OUTPUT
        ('--distances', dict(
            required=True, action=readable_dir,
            help="The location to write the distances file.")),
        ('--centers', dict(
            required=True, action=readable_dir,
            help="The location to write the cluster center structures.")),
        ('--assignments', dict(
            required=True, action=readable_dir,
            help=("The location to write assignments of frames to "
                  "clusters."))),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    args = parser.parse_args(argv[1:])

    if args.rmsd_cutoff is None and args.n_clusters is None:
        raise exception.ImproperlyConfigured(
            "At least one of --rmsd-cutoff and --n-clusters is "
            "required to cluster.")

    n_trj_sets = len(args.trajectories)
    n_selections = len(args.atoms)

    # A lone --atoms selection is reused for every trajectory set.
    if n_selections == 1:
        args.atoms = args.atoms * n_trj_sets
    elif n_selections != n_trj_sets:
        raise exception.ImproperlyConfigured(
            "Flag --atoms must be provided either once (selection is "
            "applied to all trajectories) or the same number of times "
            "--trajectories is supplied.")

    if len(args.topologies) != n_trj_sets:
        raise exception.ImproperlyConfigured(
            "The number of --topology and --trajectory flags must agree.")

    if args.algorithm == 'khybrid':
        args.Clusterer = KHybrid
    elif args.algorithm == 'kcenters':
        args.Clusterer = KCenters

    if args.subsample is None:
        args.subsample = 1

    if args.subsample == 1 and args.no_reassign:
        warnings.warn(
            "When subsampling is 1 (or unspecified), --no-reassign has "
            "no effect.")

    if args.centers.endswith('.h5'):
        warnings.warn(
            "You provided a centers file that looks like it's an h5... "
            "centers are saved as pickle. Are you sure this is what you "
            "want?")

    return args
def process_command_line(argv):
    """Parse the CARDS command line and validate the buffer size.

    Parameters
    ----------
    argv : list of str
        Full argument vector; the first element is skipped.

    Returns
    -------
    args : argparse.Namespace
        Parsed options: ``trajectories``, ``topology``, ``buffer_size``,
        ``processes``, ``matrices`` and ``indices``.

    Raises
    ------
    exception.ImproperlyConfigured
        If ``--buffer-size`` is not strictly between 0 and 360.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Compute CARDS matricies for a set of trajectories "
                    "and save all matrices and dihedral mappings.\n \n"
                    "Please cite the following papers if you use CARDS "
                    "with enspara:\n"
                    "[1] Singh, S. and Bowman, G.R.\n"
                    " Journal of Chemical Theory and Computation\n"
                    " 2017 13 (4), 1509-1517\n"
                    " DOI: 10.1021/acs.jctc.6b01181\n"
                    "\n"
                    "[2] Porter,J.R., Zimmerman, M.I., and Bowman G.R.\n"
                    " bioRxiv 431072; doi: https://doi.org/10.1101/431072\n")

    # INPUTS
    input_args = parser.add_argument_group("Input Settings")
    input_args.add_argument(
        '--trajectories', required=True, nargs="+", action='append',
        help="List of paths to aligned trajectory files to cluster. "
             "All file types that MDTraj supports are supported here.")
    input_args.add_argument(
        '--topology', required=True, action='append',
        help="The topology file for the trajectories.")

    # PARAMETERS
    cards_args = parser.add_argument_group("CARDS Settings")
    cards_args.add_argument(
        '--buffer-size', default=15, type=int,
        help="Size of buffer zone between rotameric states, in degrees.")
    # argparse only applies `type` to string defaults, so the default must
    # already be an int: use floor division rather than true division.
    cards_args.add_argument(
        "--processes", default=max(1, auto_nprocs() // 4), type=int,
        help="Number of processes to use.")

    # OUTPUT
    output_args = parser.add_argument_group("Output Settings")
    output_args.add_argument(
        '--matrices', required=True, action=readable_dir,
        help="The folder location to write the four CARDS matrices "
             "(as pickle).")
    output_args.add_argument(
        '--indices', required=True, action=readable_dir,
        help="The location to write the dihedral indices file (as CSV).")

    args = parser.parse_args(argv[1:])

    # CARDS FEATURES
    if not (0 < args.buffer_size < 360):
        raise exception.ImproperlyConfigured(
            "The given buffer size (%s) is not possible." %
            args.buffer_size)

    return args
def process_command_line(argv):
    """Parse and validate the command line of the `cluster` program.

    Exactly one of ``--features`` or ``--trajectories`` must be given.
    After parsing, the options are cross-checked and several are
    rewritten in place: ``cluster_distance`` becomes a callable,
    ``Clusterer`` is set to the class matching ``--algorithm``, and
    ``atoms`` is repeated once per trajectory set.

    Parameters
    ----------
    argv : list of str
        Full argument vector; the first element is skipped.

    Returns
    -------
    args : argparse.Namespace
        The parsed and normalized options.

    Raises
    ------
    exception.ImproperlyConfigured
        If the supplied combination of options is inconsistent (see the
        individual messages below).
    """
    FEATURE_DISTANCES = ['euclidean', 'manhattan']
    TRAJECTORY_DISTANCES = ['rmsd']

    parser = argparse.ArgumentParser(
        prog='cluster',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Cluster a set (or several sets) of trajectories "
                    "into a single state space based upon RMSD.")

    # INPUTS
    input_args = parser.add_argument_group("Input Settings")
    input_data_group = parser.add_mutually_exclusive_group(required=True)
    input_data_group.add_argument(
        "--features", nargs='+',
        help="The h5 file containin observations and features.")
    input_data_group.add_argument(
        '--trajectories', nargs="+", action='append',
        help="List of paths to aligned trajectory files to cluster. "
             "All file types that MDTraj supports are supported here.")
    input_args.add_argument(
        '--topology', action='append', dest='topologies',
        help="The topology file for the trajectories. This flag must be"
             " specified once for each instance of the --trajectories "
             "flag. The first --topology flag is taken to be the "
             "topology to use for the first instance of the "
             "--trajectories flag, and so forth.")

    # PARAMETERS
    cluster_args = parser.add_argument_group("Clustering Settings")
    cluster_args.add_argument(
        '--algorithm', required=True, choices=["khybrid", "kcenters"],
        help="The clustering algorithm to use.")
    cluster_args.add_argument(
        '--atoms', action="append",
        help="When clustering trajectories, specifies which atoms from the "
             "trajectories (using MDTraj atom-selection syntax) to cluster "
             "based upon. Specify once to apply this selection to every set "
             "of trajectories specified by the --trajectories flag, or "
             "once for each different topology (i.e. the number of "
             "times --trajectories and --topology was specified.)")
    cluster_args.add_argument(
        '--cluster-radius', default=None, type=float,
        help="Produce clusters with a maximum distance to cluster "
             "center of this value.")
    cluster_args.add_argument(
        '--cluster-number', default=None, type=int,
        help="Produce at least this number of clusters.")
    cluster_args.add_argument(
        "--cluster-distance", default=None,
        choices=FEATURE_DISTANCES + TRAJECTORY_DISTANCES,
        help="The metric for measuring distances. Some metrics (e.g. rmsd) "
             "only apply to trajectories, and others only to features.")
    cluster_args.add_argument(
        "--cluster-iterations", default=None, type=int,
        help="The number of refinement iterations to perform. This is only "
             "relevant to khybrid clustering.")
    cluster_args.add_argument(
        '--subsample', default=1, type=int,
        help="Take only every nth frame when loading trajectories. "
             "1 implies no subsampling.")

    # OUTPUT
    output_args = parser.add_argument_group("Output Settings")
    output_args.add_argument(
        '--no-reassign', default=False, action='store_true',
        help="Do not do a reassigment step. Ignored if --subsample is "
             "not supplied or 1.")
    output_args.add_argument(
        '--distances', required=True, action=readable_dir,
        help="The location to write the distances file.")
    output_args.add_argument(
        '--center-features', required=True, action=readable_dir,
        help="The location to write the cluster center structures.")
    output_args.add_argument(
        '--assignments', required=True, action=readable_dir,
        help="The location to write assignments of frames to clusters.")
    output_args.add_argument(
        "--center-indices", required=False, action=readable_dir,
        help="Location for cluster center indices output (pickle).")

    args = parser.parse_args(argv[1:])

    if args.features:
        args.features = expand_files([args.features])[0]

        if mpi_mode and len(args.features) == 1:
            raise exception.ImproperlyConfigured(
                'Cannot use ragged array h5 files in MPI mode.')

        # This check has to run before the compatibility check below:
        # a missing distance is never in FEATURE_DISTANCES, so otherwise
        # this clearer message could never be reached.
        if not args.cluster_distance:
            raise exception.ImproperlyConfigured(
                "Option --cluster-distance is required when clustering "
                "features.")

        if args.cluster_distance in FEATURE_DISTANCES:
            # Swap the metric name for the function of that name.
            args.cluster_distance = getattr(libdist, args.cluster_distance)
        else:
            raise exception.ImproperlyConfigured(
                "The given distance (%s) is not compatible with features." %
                args.cluster_distance)

        if args.subsample != 1 and len(args.features) == 1:
            raise exception.ImproperlyConfigured(
                "Subsampling is not supported for h5 inputs.")

        # TODO: not necessary if mutually exclusive above works
        if args.trajectories:
            raise exception.ImproperlyConfigured(
                "--features and --trajectories are mutually exclusive. "
                "Either trajectories or features, not both, are clustered.")

        if args.topologies:
            raise exception.ImproperlyConfigured(
                "When --features is specified, --topology is unneccessary.")
        if args.atoms:
            raise exception.ImproperlyConfigured(
                "Option --atoms is only meaningful when clustering "
                "trajectories.")

    elif args.trajectories and args.topologies:
        args.trajectories = expand_files(args.trajectories)

        if not args.cluster_distance or args.cluster_distance == 'rmsd':
            args.cluster_distance = md.rmsd
        else:
            raise exception.ImproperlyConfigured(
                "Option --cluster-distance must be rmsd when clustering "
                "trajectories.")

        if not args.atoms:
            raise exception.ImproperlyConfigured(
                "Option --atoms is required when clustering trajectories.")
        elif len(args.atoms) == 1:
            # A single selection is reused for every trajectory set.
            args.atoms = args.atoms * len(args.trajectories)
        elif len(args.atoms) != len(args.trajectories):
            raise exception.ImproperlyConfigured(
                "Flag --atoms must be provided either once (selection is "
                "applied to all trajectories) or the same number of times "
                "--trajectories is supplied.")

        if len(args.topologies) != len(args.trajectories):
            raise exception.ImproperlyConfigured(
                "The number of --topology and --trajectory flags must agree.")

    else:
        # CANNOT CLUSTER
        raise exception.ImproperlyConfigured(
            "Either --features or both of --trajectories and --topologies "
            "are required.")

    if args.cluster_radius is None and args.cluster_number is None:
        raise exception.ImproperlyConfigured(
            "At least one of --cluster-radius and --cluster-number is "
            "required to cluster.")

    if args.algorithm == 'kcenters':
        args.Clusterer = KCenters
        if args.cluster_iterations is not None:
            raise exception.ImproperlyConfigured(
                "--cluster-iterations only has an effect when using an "
                "interative clustering scheme (e.g. khybrid).")
    elif args.algorithm == 'khybrid':
        args.Clusterer = KHybrid

    if args.no_reassign and args.subsample == 1:
        warnings.warn("When subsampling is 1 (or unspecified), "
                      "--no-reassign has no effect.")
    if not args.no_reassign and mpi_mode and args.subsample > 1:
        warnings.warn("Reassignment is suppressed in MPI mode.")
        args.no_reassign = True

    # Warn when the centers file extension does not match the format the
    # warning text says will be written.
    center_ext = os.path.splitext(args.center_features)[1]
    if args.trajectories:
        if center_ext == '.h5':
            warnings.warn(
                "You provided a centers file that looks like it's an h5... "
                "centers are saved as pickle. Are you sure this is what you "
                "want?")
    else:
        if center_ext != '.npy':
            warnings.warn(
                "You provided a centers file that looks like it's not "
                "an npy, but this is how they are saved. Are you sure "
                "this is what you want?")

    return args