示例#1
0
def load(topologies, trajectories, selections, stride, processes):
    """Load several sets of trajectories into one concatenated array.

    Parameters
    ----------
    topologies : iterable of path
        One topology file per set of trajectories; each is read with
        ``md.load``.
    trajectories : iterable of iterable of path
        One collection of trajectory files per entry in `topologies`.
    selections : iterable of str
        One atom-selection string per entry in `topologies`, handed to
        the topology's ``select`` method. Every selection must pick the
        same number of atoms.
    stride : int
        Stored as ``'stride'`` in the per-trajectory loading config.
    processes : int
        Forwarded to ``load_as_concatenated``.

    Returns
    -------
    lengths
        First value returned by ``load_as_concatenated``.
    xyz
        A copy of the second value returned by ``load_as_concatenated``.
    top
        The last topology, subset to the atoms its selection picked.

    Raises
    ------
    exception.ImproperlyConfigured
        If a selection cannot be applied to its topology, if selections
        pick differing numbers of atoms, if nothing was supplied to
        load, or if the selections pick no atoms at all.
    """

    # Read each topology file once and resolve its selection right away,
    # so a bad selection fails fast before any trajectory is loaded.
    resolved = []
    for topfile, selection in zip(topologies, selections):
        top = md.load(topfile).top
        try:
            indices = top.select(selection)
        except Exception:
            raise exception.ImproperlyConfigured(
                ("The provided selection '{s}' didn't match the topology "
                 "file, {t}").format(s=selection, t=topfile))
        resolved.append((topfile, top, indices))

    flat_trjs = []
    configs = []
    n_inds = None

    for (topfile, top, indices), trjset in zip(resolved, trajectories):
        if n_inds is not None and n_inds != len(indices):
            raise exception.ImproperlyConfigured(
                ("Selection on topology %s selected %s atoms, but "
                 "other selections selected %s atoms.") %
                (topfile, len(indices), n_inds))
        n_inds = len(indices)

        # Every trajectory in this set shares the set's topology/selection.
        for trj in trjset:
            flat_trjs.append(trj)
            configs.append({
                'top': top,
                'stride': stride,
                'atom_indices': indices,
            })

    if n_inds is None:
        # Nothing was iterated, so there is no topology to report on or
        # to return; fail with a clear message instead of a NameError.
        raise exception.ImproperlyConfigured(
            "At least one topology, set of trajectories and selection is "
            "required to load trajectories.")

    logger.info(
        "Loading %s trajectories with %s atoms using %s processes "
        "(subsampling %s)", len(flat_trjs), n_inds, processes, stride)
    if n_inds == 0:
        # Raised explicitly (not asserted) so the check survives `-O`.
        raise exception.ImproperlyConfigured(
            "No atoms selected for clustering")

    with timed("Loading took %.1f sec", logger.info):
        lengths, xyz = load_as_concatenated(flat_trjs,
                                            args=configs,
                                            processes=processes)

    # Replace the loaded array with a private copy; the original buffer
    # is released once the name is rebound.
    with timed("Turned over array in %.2f min", logger.info):
        xyz = xyz.copy()

    logger.info("Loaded %s frames.", len(xyz))

    return lengths, xyz, top.subset(indices)
示例#2
0
def process_units(timestep=None, infer_timestep=None):
    """Take the timestep parameter and infer_timestep parameters from
    the command line arguments and convert it to the string indicating
    units (ns) and the factor converting ns to frames.

    Parameters
    ----------
    timestep : float
        Ratio of ns to frames. This is typically 10 (for 100 ps
        timesteps) or 100 (for 10 ps timesteps).
    infer_timestep : str, path
        Path to a trajectory containing timestep information to infer
        the correct timestep from when plotting implied timescales.

    Returns
    -------
    unit_factor : number
        `timestep` itself when it is given, ``1000 / dt`` when the
        timestep ``dt`` (taken to be in ps) is inferred from a file,
        and ``1`` when neither argument is supplied.
    unit_str : str
        ``'ns'`` when a timestep is known, ``'frames'`` otherwise.

    Raises
    ------
    exception.ImproperlyConfigured
        If both arguments are supplied, if the file needs the
        topology-less fallback but is not an XTC, or if no constant
        timestep can be read from the XTC file.
    """

    if timestep and infer_timestep:
        raise exception.ImproperlyConfigured(
            'Only one of --timestep and --infer-timestep can be '
            'supplied, you supplied both --timestep=%s and '
            '--infer-timestep=%s' % (timestep, infer_timestep))

    if timestep:
        return timestep, 'ns'

    if not infer_timestep:
        return 1, 'frames'

    try:
        timestep = md.load(infer_timestep).timestep
    except ValueError:
        # Fallback for files that cannot be loaded without a topology:
        # read the frame times directly from the XTC file.
        if infer_timestep[-4:] != '.xtc':
            raise exception.ImproperlyConfigured(
                "Topologyless formats other than XTC are not supported.")
        with md.formats.xtc.XTCTrajectoryFile(infer_timestep) as f:
            xyz, time, step, box = f.read(n_frames=10)

        if len(time) < 2:
            # A single frame carries no spacing information; report it
            # rather than failing with an IndexError below.
            raise exception.ImproperlyConfigured(
                "Could not infer a timestep from %s: fewer than two "
                "frames were read." % infer_timestep)

        timesteps = time[1:] - time[0:-1]
        # Explicit raise instead of assert so the check survives `-O`.
        if not np.all(timesteps[0] == timesteps):
            raise exception.ImproperlyConfigured(
                "Could not infer a timestep from %s: frames are not "
                "evenly spaced in time." % infer_timestep)
        timestep = timesteps[0]

    return 1000 / timestep, 'ns'  # units are ps
示例#3
0
def calculate_piecewise_helix_vectors(trj,
                                      helix_resnums=None,
                                      helix_start=None,
                                      helix_end=None):
    """Compute per-frame direction vectors and centers for an alpha-helix.

    The helix is identified either by an explicit list of residue
    numbers or by an inclusive start/end residue range.

    Parameters
    ----------
    trj : md.Trajectory object
        An MDTraj trajectory whose frames are used to compute the
        helix vectors.
    helix_resnums : array, shape [n_residues, ], optional, default: None
        Residue numbers that make up the helix. Useful when the residue
        numbering within the helix is irregular. When omitted, both
        `helix_start` and `helix_end` must be given.
    helix_start : int, optional, default: None
        The starting residue of the helix.
    helix_end : int, optional, default: None
        The ending residue of the helix (included in the range).

    Returns
    ----------
    vectors : array, [n_frames, 3]
        Per-frame helix vectors as produced by
        ``_generate_vectors_from_coords`` with ``n_avg=12``.
    center_coords : array, [n_frames, 3]
        Mean coordinate of the selected backbone atoms in each frame.
        Can be used to reconstruct a line going through the helix.

    Raises
    ------
    exception.ImproperlyConfigured
        If neither `helix_resnums` nor both of `helix_start` and
        `helix_end` are supplied.
    """
    if helix_resnums is None:
        if helix_start is None or helix_end is None:
            raise exception.ImproperlyConfigured(
                "Either 'helix_resnums' or 'helix_start' and 'helix_end' "
                "are required.")
        # The end residue is part of the helix, hence the +1.
        helix_resnums = np.arange(helix_start, helix_end + 1)

    backbone_indices = _get_backbone_nums(trj.topology, helix_resnums)
    helix_xyz = trj.xyz[:, backbone_indices]

    helix_vectors = _generate_vectors_from_coords(helix_xyz, n_avg=12)
    helix_centers = helix_xyz.mean(axis=1)
    return helix_vectors, helix_centers
示例#4
0
def process_command_line(argv):
    '''Build the RMSD-clustering command line parser, parse `argv`
    (skipping the program name in ``argv[0]``) and normalise the result.

    After parsing, the returned namespace is adjusted as follows: a
    single --atoms selection is repeated once per --trajectories flag,
    ``Clusterer`` is set to the class matching --algorithm, and a
    missing --subsample becomes 1. ``exception.ImproperlyConfigured`` is
    raised when neither --rmsd-cutoff nor --n-clusters is given or when
    the counts of --atoms/--topology do not match --trajectories.'''

    cli = argparse.ArgumentParser(
        description="Cluster a set (or several sets) of trajectories "
        "into a single state space based upon RMSD.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # ---- inputs ----
    cli.add_argument(
        '--trajectories', action='append', nargs="+", required=True,
        help="List of paths to aligned trajectory files to cluster. "
        "All file types that MDTraj supports are supported here.")
    cli.add_argument(
        '--topology', dest='topologies', action='append', required=True,
        help="The topology file for the trajectories. This flag must be"
        " specified once for each instance of the --trajectories "
        "flag. The first --topology flag is taken to be the "
        "topology to use for the first instance of the "
        "--trajectories flag, and so forth.")

    # ---- clustering parameters ----
    cli.add_argument(
        '--algorithm', choices=["khybrid", "kcenters"], required=True,
        help="The clustering algorithm to use.")
    cli.add_argument(
        '--atoms', required=True, action="append",
        help="The atoms from the trajectories (using MDTraj "
        "atom-selection syntax) to cluster based upon. Specify "
        "once to apply this selection to every set of "
        "trajectories specified by the --trajectories flag, or "
        "once for each different topology (i.e. the number of "
        "times --trajectories and --topology was specified.)")
    cli.add_argument(
        '--rmsd-cutoff', type=float, default=None,
        help="Produce clusters with a maximum distance to cluster "
        "center of this value.. Units: nm.")
    cli.add_argument(
        '--n-clusters', type=int, default=None,
        help="Produce at least this number of clusters.")
    cli.add_argument(
        '--processes', type=int, default=cpu_count(),
        help="Number processes to use for loading and clustering.")
    cli.add_argument(
        '--subsample', type=int, default=None,
        help="Take only every nth frame when loading trajectories. "
        "1 implies no subsampling.")
    cli.add_argument(
        '--no-reassign', action='store_true', default=False,
        help="Do not do a reassigment step. Ignored if --subsample is "
        "not supplied or 1.")

    # ---- outputs (all share the same flag configuration) ----
    output_flags = (
        ('--distances', "The location to write the distances file."),
        ('--centers',
         "The location to write the cluster center structures."),
        ('--assignments',
         "The location to write assignments of frames to clusters."),
    )
    for flag, description in output_flags:
        cli.add_argument(flag,
                         required=True,
                         action=readable_dir,
                         help=description)

    opts = cli.parse_args(argv[1:])

    if opts.rmsd_cutoff is None and opts.n_clusters is None:
        raise exception.ImproperlyConfigured(
            "At least one of --rmsd-cutoff and --n-clusters is "
            "required to cluster.")

    # A lone --atoms selection applies to every trajectory set.
    n_atom_selections = len(opts.atoms)
    if n_atom_selections == 1:
        opts.atoms = opts.atoms * len(opts.trajectories)
    elif n_atom_selections != len(opts.trajectories):
        raise exception.ImproperlyConfigured(
            "Flag --atoms must be provided either once (selection is "
            "applied to all trajectories) or the same number of times "
            "--trajectories is supplied.")

    if len(opts.topologies) != len(opts.trajectories):
        raise exception.ImproperlyConfigured(
            "The number of --topology and --trajectory flags must agree.")

    if opts.algorithm == 'khybrid':
        opts.Clusterer = KHybrid
    elif opts.algorithm == 'kcenters':
        opts.Clusterer = KCenters

    # An omitted --subsample means "use every frame".
    if opts.subsample is None:
        opts.subsample = 1

    if opts.subsample == 1 and opts.no_reassign:
        warnings.warn(
            "When subsampling is 1 (or unspecified), --no-reassign has no effect."
        )

    if opts.centers.endswith('.h5'):
        warnings.warn(
            "You provided a centers file that looks like it's an h5... "
            "centers are saved as pickle. Are you sure this is what you want?")

    return opts
示例#5
0
def process_command_line(argv):
    '''Parse the CARDS command line and validate the parsed values.

    `argv` is the full argument vector; ``argv[0]`` (the program name)
    is skipped. Returns the parsed ``argparse.Namespace``. Raises
    ``exception.ImproperlyConfigured`` when --buffer-size is not
    strictly between 0 and 360.'''

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Compute CARDS matricies for a set of trajectories "
        "and save all matrices and dihedral mappings.\n \n"
        "Please cite the following papers if you use CARDS with enspara:\n"
        "[1] Singh, S. and Bowman, G.R.\n"
        "    Journal of Chemical Theory and Computation\n"
        "    2017 13 (4), 1509-1517\n"
        "    DOI: 10.1021/acs.jctc.6b01181\n"
        "\n"
        "[2] Porter,J.R.,  Zimmerman, M.I., and Bowman G.R.\n"
        "    bioRxiv 431072; doi: https://doi.org/10.1101/431072\n")

    # INPUTS
    input_args = parser.add_argument_group("Input Settings")
    input_args.add_argument(
        '--trajectories',
        required=True,
        nargs="+",
        action='append',
        help="List of paths to aligned trajectory files to cluster. "
        "All file types that MDTraj supports are supported here.")
    input_args.add_argument('--topology',
                            required=True,
                            action='append',
                            help="The topology file for the trajectories.")

    # PARAMETERS
    cards_args = parser.add_argument_group("CARDS Settings")
    cards_args.add_argument(
        '--buffer-size',
        default=15,
        type=int,
        help="Size of buffer zone between rotameric states, in degrees.")
    # Floor division keeps the default an integer: argparse applies
    # `type` only to string defaults, so a true-division result would
    # reach the caller as a float process count.
    cards_args.add_argument("--processes",
                            default=max(1,
                                        auto_nprocs() // 4),
                            type=int,
                            help="Number of processes to use.")

    # OUTPUT
    output_args = parser.add_argument_group("Output Settings")
    output_args.add_argument(
        '--matrices',
        required=True,
        action=readable_dir,
        help="The folder location to write the four CARDS matrices (as pickle)."
    )
    output_args.add_argument(
        '--indices',
        required=True,
        action=readable_dir,
        help="The location to write the dihedral indices file (as CSV).")

    args = parser.parse_args(argv[1:])

    # CARDS FEATURES
    if not (0 < args.buffer_size < 360):
        raise exception.ImproperlyConfigured(
            "The given buffer size (%s) is not possible." % args.buffer_size)

    return args
示例#6
0
def process_command_line(argv):
    """Parse and validate the clustering command line.

    Exactly one of --features or --trajectories must be supplied (they
    form a required, mutually exclusive group).

    Parameters
    ----------
    argv : list of str
        Full argument vector; ``argv[0]`` (the program name) is skipped.

    Returns
    -------
    args : argparse.Namespace
        The parsed arguments, post-processed so that:

        * ``features`` / ``trajectories`` have been passed through
          ``expand_files``;
        * ``cluster_distance`` is a callable (an attribute of
          ``libdist`` for features, ``md.rmsd`` for trajectories);
        * ``atoms`` holds one selection per --trajectories flag;
        * ``Clusterer`` is the class matching --algorithm;
        * ``no_reassign`` is forced to True in MPI mode when
          subsampling.

    Raises
    ------
    exception.ImproperlyConfigured
        If the combination of supplied options is inconsistent (see
        the individual messages below).
    """

    FEATURE_DISTANCES = ['euclidean', 'manhattan']
    TRAJECTORY_DISTANCES = ['rmsd']

    parser = argparse.ArgumentParser(
        prog='cluster',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Cluster a set (or several sets) of trajectories "
        "into a single state space based upon RMSD.")

    # INPUTS
    input_args = parser.add_argument_group("Input Settings")
    input_data_group = parser.add_mutually_exclusive_group(required=True)
    input_data_group.add_argument(
        "--features",
        nargs='+',
        help="The h5 file containin observations and features.")
    input_data_group.add_argument(
        '--trajectories',
        nargs="+",
        action='append',
        help="List of paths to aligned trajectory files to cluster. "
        "All file types that MDTraj supports are supported here.")
    input_args.add_argument(
        '--topology',
        action='append',
        dest='topologies',
        help="The topology file for the trajectories. This flag must be"
        " specified once for each instance of the --trajectories "
        "flag. The first --topology flag is taken to be the "
        "topology to use for the first instance of the "
        "--trajectories flag, and so forth.")

    # PARAMETERS
    cluster_args = parser.add_argument_group("Clustering Settings")
    cluster_args.add_argument('--algorithm',
                              required=True,
                              choices=["khybrid", "kcenters"],
                              help="The clustering algorithm to use.")
    cluster_args.add_argument(
        '--atoms',
        action="append",
        help="When clustering trajectories, specifies which atoms from the "
        "trajectories (using MDTraj atom-selection syntax) to cluster "
        "based upon. Specify once to apply this selection to every set "
        "of trajectories specified by the --trajectories flag, or "
        "once for each different topology (i.e. the number of "
        "times --trajectories and --topology was specified.)")
    cluster_args.add_argument(
        '--cluster-radius',
        default=None,
        type=float,
        help="Produce clusters with a maximum distance to cluster "
        "center of this value.")
    cluster_args.add_argument('--cluster-number',
                              default=None,
                              type=int,
                              help="Produce at least this number of clusters.")
    cluster_args.add_argument(
        "--cluster-distance",
        default=None,
        choices=FEATURE_DISTANCES + TRAJECTORY_DISTANCES,
        help="The metric for measuring distances. Some metrics (e.g. rmsd) "
        "only apply to trajectories, and others only to features.")
    cluster_args.add_argument(
        "--cluster-iterations",
        default=None,
        type=int,
        help="The number of refinement iterations to perform. This is only "
        "relevant to khybrid clustering.")

    cluster_args.add_argument(
        '--subsample',
        default=1,
        type=int,
        help="Take only every nth frame when loading trajectories. "
        "1 implies no subsampling.")

    # OUTPUT
    output_args = parser.add_argument_group("Output Settings")
    output_args.add_argument(
        '--no-reassign',
        default=False,
        action='store_true',
        help="Do not do a reassigment step. Ignored if --subsample is "
        "not supplied or 1.")

    output_args.add_argument('--distances',
                             required=True,
                             action=readable_dir,
                             help="The location to write the distances file.")
    output_args.add_argument(
        '--center-features',
        required=True,
        action=readable_dir,
        help="The location to write the cluster center structures.")
    output_args.add_argument(
        '--assignments',
        required=True,
        action=readable_dir,
        help="The location to write assignments of frames to clusters.")
    output_args.add_argument(
        "--center-indices",
        required=False,
        action=readable_dir,
        help="Location for cluster center indices output (pickle).")

    args = parser.parse_args(argv[1:])

    if args.features:
        args.features = expand_files([args.features])[0]

        if mpi_mode and len(args.features) == 1:
            raise exception.ImproperlyConfigured(
                'Cannot use ragged array h5 files in MPI mode.')

        # Check for a missing metric first: once the name has been
        # swapped for a callable below, this condition can never fire,
        # and a missing metric would otherwise be reported as an
        # incompatible one.
        if not args.cluster_distance:
            raise exception.ImproperlyConfigured(
                "Option --cluster-distance is required when clustering "
                "features.")

        if args.cluster_distance in FEATURE_DISTANCES:
            args.cluster_distance = getattr(libdist, args.cluster_distance)
        else:
            raise exception.ImproperlyConfigured(
                "The given distance (%s) is not compatible with features." %
                args.cluster_distance)

        # A single features file is treated as an h5 input (same test
        # as the MPI check above).
        if args.subsample != 1 and len(args.features) == 1:
            raise exception.ImproperlyConfigured(
                "Subsampling is not supported for h5 inputs.")

        # TODO: not necessary if the mutually exclusive group above works
        if args.trajectories:
            raise exception.ImproperlyConfigured(
                "--features and --trajectories are mutually exclusive. "
                "Either trajectories or features, not both, are clustered.")
        if args.topologies:
            raise exception.ImproperlyConfigured(
                "When --features is specified, --topology is unneccessary.")
        if args.atoms:
            raise exception.ImproperlyConfigured(
                "Option --atoms is only meaningful when clustering "
                "trajectories.")

    elif args.trajectories and args.topologies:
        args.trajectories = expand_files(args.trajectories)

        # RMSD is both the default and the only metric for trajectories.
        if not args.cluster_distance or args.cluster_distance == 'rmsd':
            args.cluster_distance = md.rmsd
        else:
            raise exception.ImproperlyConfigured(
                "Option --cluster-distance must be rmsd when clustering "
                "trajectories.")

        if not args.atoms:
            raise exception.ImproperlyConfigured(
                "Option --atoms is required when clustering trajectories.")
        elif len(args.atoms) == 1:
            # A lone selection applies to every set of trajectories.
            args.atoms = args.atoms * len(args.trajectories)
        elif len(args.atoms) != len(args.trajectories):
            raise exception.ImproperlyConfigured(
                "Flag --atoms must be provided either once (selection is "
                "applied to all trajectories) or the same number of times "
                "--trajectories is supplied.")

        if len(args.topologies) != len(args.trajectories):
            raise exception.ImproperlyConfigured(
                "The number of --topology and --trajectory flags must agree.")

    else:
        # CANNOT CLUSTER
        raise exception.ImproperlyConfigured(
            "Either --features or both of --trajectories and --topologies "
            "are required.")

    if args.cluster_radius is None and args.cluster_number is None:
        raise exception.ImproperlyConfigured(
            "At least one of --cluster-radius and --cluster-number is "
            "required to cluster.")

    if args.algorithm == 'kcenters':
        args.Clusterer = KCenters
        if args.cluster_iterations is not None:
            raise exception.ImproperlyConfigured(
                "--cluster-iterations only has an effect when using an "
                "interative clustering scheme (e.g. khybrid).")
    elif args.algorithm == 'khybrid':
        args.Clusterer = KHybrid

    if args.no_reassign and args.subsample == 1:
        warnings.warn("When subsampling is 1 (or unspecified), "
                      "--no-reassign has no effect.")
    if not args.no_reassign and mpi_mode and args.subsample > 1:
        warnings.warn("Reassignment is suppressed in MPI mode.")
        args.no_reassign = True

    # Warn when the centers file extension disagrees with the format
    # named in the corresponding warning text.
    if args.trajectories:
        if os.path.splitext(args.center_features)[1] == '.h5':
            warnings.warn(
                "You provided a centers file that looks like it's an h5... "
                "centers are saved as pickle. Are you sure this is what you "
                "want?")
    else:
        if os.path.splitext(args.center_features)[1] != '.npy':
            warnings.warn(
                "You provided a centers file that looks like it's not "
                "an npy, but this is how they are saved. Are you sure "
                "this is what you want?")

    return args