Пример #1
0
class SaveFeaturizer(Command, GaussianFeaturizationMixin):
    """Command that builds a featurizer from an index file and saves it.

    Depending on which of the mutually exclusive munging options was
    given, either an ``AtomPairsFeaturizer`` (``--distance-pairs``) or a
    ``SuperposeFeaturizer`` (``--atom-indices``) is constructed against the
    loaded topology and written to ``--filename``. All of that happens in
    ``__init__``.
    """
    name = 'featurizer'
    description = '''Create and save a featurizer for later use.'''

    group_feature = argument_group('Featurizer Loading')
    group_feature.add_argument(
        '--top', type=str, required=True,
        help='''Topology file for
        loading trajectories''')
    group_feature.add_argument(
        '-o', '--filename', type=str, default='featurizer.pickl',
        help='''Output featurizer to this filename. default='featurizer.pickl' ''')

    def __init__(self, args):
        """Load the topology and index file, then build and save the featurizer.

        ``args`` is the parsed command-line namespace; this method reads
        ``top``, ``distance_pairs``, ``atom_indices`` and ``filename``.
        """
        self.args = args
        self.top = (md.load(os.path.expanduser(args.top))
                    if args.top is not None else None)

        if args.distance_pairs is None:
            # Superposition mode: the index file must be a single column.
            self.indices = np.loadtxt(args.atom_indices, dtype=int, ndmin=2)
            n_columns = self.indices.shape[1]
            if n_columns != 1:
                self.error(
                    'atom-indices must have shape (N, 1). %s had shape %s' %
                    (args.atom_indices, self.indices.shape))
            # Flatten the (N, 1) column into a 1-D index vector.
            self.indices = self.indices.reshape(-1)
            featurizer = SuperposeFeaturizer(self.indices, self.top)
        else:
            # Pair-distance mode: the index file must have two columns.
            self.indices = np.loadtxt(args.distance_pairs, dtype=int, ndmin=2)
            n_columns = self.indices.shape[1]
            if n_columns != 2:
                self.error(
                    'distance-pairs must have shape (N, 2). %s had shape %s' %
                    (args.distance_pairs, self.indices.shape))
            featurizer = AtomPairsFeaturizer(self.indices, self.top)

        featurizer.save(args.filename)

    def start(self):
        """Entry point of the command; only reads ``self.args`` here."""
        _args = self.args
Пример #2
0
class MDTrajInputMixin(object):
    """Mixin giving a command its trajectory-input options.

    Declares three required options in the 'MDTraj Options' group:
    ``--dir``, ``--top`` and ``--ext``.
    """
    group_mdtraj = argument_group('MDTraj Options')
    group_mdtraj.add_argument(
        '--dir', type=str, required=True,
        help='''Directory containing
        the trajectories to load''')
    group_mdtraj.add_argument(
        '--top', type=str, required=True,
        help='''Topology file for
        loading trajectories''')
    # The allowed extensions are the registered loader keys with their first
    # character dropped (presumably the leading "." -- confirm against mdtraj).
    group_mdtraj.add_argument(
        '--ext', required=True,
        choices=[e[1:] for e in md._FormatRegistry.loaders.keys()],
        help='File extension of the trajectories')
Пример #3
0
class GaussianFeaturizationMixin(object):
    """Mixin declaring how trajectories are turned into feature vectors.

    Exactly one of ``--distance-pairs`` / ``--atom-indices`` must be given;
    both take the path of a text file.
    """
    group_munge = argument_group('Munging Options')
    # The two vectorization modes are mutually exclusive and one is required.
    group_vector = group_munge.add_mutually_exclusive_group(required=True)
    group_vector.add_argument(
        '-d', '--distance-pairs', type=str,
        help='''Vectorize
        the MD trajectories by extracting timeseries of the distance
        between pairs of atoms in each frame. Supply a text file where
        each row contains the space-separate indices of two atoms which
        form a pair to monitor''')
    group_vector.add_argument(
        '-a', '--atom-indices', type=str,
        help='''Superpose
        each MD conformation on the coordinates in the topology file, and then use
        the distance from each atom in the reference conformation to the
        corresponding atom in each MD conformation.''')
Пример #4
0
class FitVMHMM(Command, MDTrajInputMixin):
    """Command that fits von Mises HMMs to dihedral-angle timeseries.

    Trajectories matching ``--dir``/``--ext`` are converted to dihedral
    angles for the atom quadruples listed in ``--dihedral-indices``; one
    model is fit per (lag time, number of states) combination and each
    result is appended as one JSON line to ``--out``.
    """
    name = 'fit-vmhmm'
    description = '''Fit von-Mises hidden Markov models with EM.

    The von Mises distribution, (also known as the circular normal
    distribution or Tikhonov distribution) is a continuous probability
    distribution on the circle. For multivariate signals, the emissions
    distribution implemented by this model is a product of univariate
    von Mises distributions -- analogous to the multivariate Gaussian
    distribution with a diagonal covariance matrix.
    
    Because the support of the base 1D distribution is on [-pi, pi), this
    model makes a suitable emission distribution for timeseries of angles
    (e.g. protein dihedral angles).
    '''

    group_munge = argument_group('Munging Options')
    group_munge.add_argument(
        '-d',
        '--dihedral-indices',
        required=True,
        type=str,
        help='''Vectorize the MD trajectories by extracting timeseries of the
        dihedral (torsion) angles between sets of 4 atoms. Supply a text file
        where each row contains the space-separate indices of four atoms which
        form a dihedral angle to monitor. These indices are 0-based.''')

    group_hmm = argument_group('HMM Options')
    group_hmm.add_argument(
        '-k',
        '--n-states',
        action=MultipleIntAction,
        default=[2],
        help='Number of states in the models. Default = [2,]',
        nargs='+')
    group_hmm.add_argument('-l',
                           '--lag-times',
                           action=MultipleIntAction,
                           default=[1],
                           help='Lag time(s) of the model(s). Default = [1,]',
                           nargs='+')
    # group_hmm.add_argument('--platform', choices=['cuda', 'cpu', 'sklearn'],
    #     default='cpu', help='Implementation platform. default="cpu"')
    # group_hmm.add_argument('--fusion-prior', type=float, default=1e-2,
    #    help='Strength of the adaptive fusion prior. default=1e-2')
    group_hmm.add_argument(
        '--n-em-iter',
        type=int,
        default=100,
        help='Maximum number of iterations of EM. default=100')
    group_hmm.add_argument(
        '--thresh',
        type=float,
        default=1e-2,
        help='''Convergence criterion for EM. Quit when the log likelihood
        decreases by less than this threshold. default=1e-2''')
    # group_hmm.add_argument('--n-lqa-iter', type=int, default=10,
    #     help='''Max number of iterations for local quadradric approximation
    #    solving the fusion-L1. default=10''')
    group_hmm.add_argument(
        '--reversible-type',
        choices=['mle', 'transpose'],
        default='mle',
        help='''Method by which the model is constrained to be
        reversible. default="mle"''')
    group_hmm.add_argument('-sp',
                           '--split',
                           type=int,
                           help='''Split
            trajectories into smaller chunks. This looses some counts (i.e. like
            1%% of the counts are lost with --split 100), but can help with speed
            (on gpu + multicore cpu) and numerical instabilities that come when
            trajectories get extremely long.''',
                           default=10000)

    group_out = argument_group('Output')
    group_out.add_argument('-o',
                           '--out',
                           default='hmms.jsonlines',
                           help='Output file. default="hmms.jsonlines"')

    def __init__(self, args):
        """Load the topology and dihedral index table and find the input files.

        ``args`` is the parsed command-line namespace. Reports an error via
        ``self.error`` when the index table does not have four columns or
        when no trajectory file matches ``--dir``/``--ext``.
        """
        self.args = args
        self.top = md.load(args.top) if args.top is not None else None

        self.indices = np.loadtxt(args.dihedral_indices, dtype=int, ndmin=2)
        if self.indices.shape[1] != 4:
            self.error(
                'dihedral-indices must have shape (N, 4). %s had shape %s' %
                (args.dihedral_indices, self.indices.shape))
        self.filenames = glob.glob(args.dir + '/*.' + args.ext)
        # Fail early instead of fitting models on an empty dataset.
        if not self.filenames:
            self.error('No files matched.')
        self.n_features = self.indices.shape[0]

    def start(self):
        """Fit one model per (lag time, n_states) pair and append the results.

        The output file is opened in append mode; a ``#``-prefixed line with
        the invoking command line is written first, followed by one JSON
        line per fitted model.
        """
        args = self.args
        data = self.load_data()

        # Line-buffered (buffering=1) so each record reaches disk as soon as
        # its trailing newline is written. buffering=0 (unbuffered) is only
        # permitted for binary files on Python 3 and raises ValueError here.
        with open(args.out, 'a', 1) as outfile:
            outfile.write('# %s\n' % ' '.join(sys.argv))

            for lag_time in args.lag_times:
                # Keep every lag_time-th frame of each timeseries.
                subsampled = [d[::lag_time] for d in data]
                for n_states in args.n_states:
                    self.fit(subsampled, n_states, lag_time, outfile)

    def fit(self, data, n_states, lag_time, outfile):
        """Fit a single ``VonMisesHMM`` and write its summary as one JSON line.

        Parameters
        ----------
        data : list of timeseries passed to ``model.fit``
        n_states : number of hidden states for this model
        lag_time : subsampling stride used to build ``data``; fitted
            timescales are multiplied by it before being written
        outfile : open, writable text file receiving the JSON record
        """
        model = VonMisesHMM(n_states=n_states,
                            reversible_type=self.args.reversible_type,
                            n_iter=self.args.n_em_iter,
                            thresh=self.args.thresh)
        start = time.time()
        model.fit(data)
        end = time.time()

        result = {
            'model': 'VonMisesHMM',
            'timescales': (model.timescales_() * lag_time).tolist(),
            'transmat': model.transmat_.tolist(),
            'populations': model.populations_.tolist(),
            'n_states': model.n_states,
            'split': self.args.split,
            'train_lag_time': lag_time,
            'train_time': end - start,
            'means': model.means_.tolist(),
            'kappas': model.kappas_.tolist(),
            'train_logprobs': model.fit_logprob_,
            'n_train_observations': sum(len(t) for t in data),
        }
        # Warn (but still write the record) when the fit produced NaN/inf.
        if not np.all(np.isfinite(model.transmat_)):
            print('Nonfinite numbers in transmat !!')
        json.dump(result, outfile)
        outfile.write('\n')

    def load_data(self):
        """Return a list with one dihedral-angle array per trajectory chunk.

        Each file is read in chunks of ``--split`` frames; files whose name
        ends in ``h5`` are loaded without passing the topology.
        """
        data = []
        for tfn in self.filenames:
            kwargs = {} if tfn.endswith('h5') else {'top': self.top}
            for t in md.iterload(tfn, chunk=self.args.split, **kwargs):
                item = np.asarray(md.compute_dihedrals(t, self.indices),
                                  np.double)
                data.append(item)
        return data
Пример #5
0
class FitGHMM(Command, MDTrajInputMixin):
    """Command that fits Gaussian fusion HMMs to featurized trajectories.

    Trajectories matching ``--dir``/``--ext`` are featurized with the saved
    featurizer given by ``--featurizer``. One model is fit per (lag time,
    number of states, cross-validation fold) and each result is appended as
    one JSON line to ``--out``.
    """
    name = 'fit-ghmm'
    description = '''Fit L1-Regularized Reversible Gaussian hidden Markov models with EM.'''

    group_hmm = argument_group('HMM Options')
    group_hmm.add_argument('--featurizer', type=str, required=True,
        help='Path to saved featurizer object')
    group_hmm.add_argument('-k', '--n-states', action=MultipleIntAction, default=[2],
        help='Number of states in the models. Default = [2,]', nargs='+')
    group_hmm.add_argument('-l', '--lag-times', action=MultipleIntAction, default=[1],
        help='Lag time(s) of the model(s). Default = [1,]', nargs='+')
    group_hmm.add_argument('--platform', choices=['cuda', 'cpu', 'sklearn'],
        default='cpu', help='Implementation platform. default="cpu"')
    group_hmm.add_argument('--fusion-prior', type=float, default=1e-2,
        help='Strength of the adaptive fusion prior. default=1e-2')
    group_hmm.add_argument('--n-em-iter', type=int, default=100,
        help='Maximum number of iterations of EM. default=100')
    group_hmm.add_argument('--thresh', type=float, default=1e-2,
        help='''Convergence criterion for EM. Quit when the log likelihood
        decreases by less than this threshold. default=1e-2''')
    group_hmm.add_argument('--n-lqa-iter', type=int, default=10,
        help='''Max number of iterations for local quadradric approximation
        solving the fusion-L1. default=10''')
    group_hmm.add_argument('--reversible-type', choices=['mle', 'transpose'],
        default='mle', help='''Method by which the model is constrained to be
        reversible. default="mle"''')
    group_hmm.add_argument('-sp', '--split', type=int, help='''Split
        trajectories into smaller chunks. This looses some counts (i.e. like
        1%% of the counts are lost with --split 100), but can help with speed
        (on gpu + multicore cpu) and numerical instabilities that come when
        trajectories get extremely long.''', default=10000)

    group_cv = argument_group('Cross Validation')
    group_cv.add_argument('--n-cv', type=int, default=1,
        help='Run N-fold cross validation. default=1')
    # We're training and testing at the same lag time for the moment
    # group_cv.add_argument('--test-lag-time', type=int, default=1,
    #     help='Lag time at which to test the models. default=1')

    group_out = argument_group('Output')
    group_out.add_argument('-o', '--out', default='hmms.jsonlines',
        help='Output file. default="hmms.jsonlines"')


    def __init__(self, args):
        """Load the topology and featurizer and find the input files.

        ``args`` is the parsed command-line namespace. Reports an error via
        ``self.error`` when no trajectory file matches ``--dir``/``--ext``.
        """
        self.args = args
        if args.top is not None:
            self.top = md.load(os.path.expanduser(args.top))
        else:
            self.top = None

        self.featurizer = mixtape.featurizer.load(args.featurizer)
        self.filenames = glob.glob(os.path.expanduser(args.dir) + '/*.' + args.ext)
        # Fail early instead of fitting models on an empty dataset.
        if not self.filenames:
            self.error('No files matched.')
        self.n_features = self.featurizer.n_features


    def start(self):
        """Fit every requested model and append the results to ``--out``.

        The output file is opened in append mode; a ``#``-prefixed line with
        the invoking command line is written first, followed by one JSON
        line per fitted model. With ``--n-cv`` > 1 the timeseries are split
        into folds; otherwise the model is scored on its own training data.
        """
        args = self.args
        data = self.load_data()

        # Line-buffered (buffering=1) so each record reaches disk as soon as
        # its trailing newline is written. buffering=0 (unbuffered) is only
        # permitted for binary files on Python 3 and raises ValueError here.
        with open(args.out, 'a', 1) as outfile:
            outfile.write('# %s\n' % ' '.join(sys.argv))

            for lag_time in args.lag_times:
                # Keep every lag_time-th frame of each timeseries.
                subsampled = [d[::lag_time] for d in data]
                for n_states in args.n_states:

                    if args.n_cv > 1:
                        # NOTE(review): KFold(n=..., n_folds=...) looks like the
                        # legacy scikit-learn cross_validation signature --
                        # confirm against the installed version.
                        for fold, (train_i, test_i) in enumerate(KFold(n=len(data), n_folds=args.n_cv)):
                            train = [subsampled[i] for i in train_i]
                            test = [subsampled[i] for i in test_i]

                            self.fit(train, test, n_states, lag_time, fold, args, outfile)
                    else:
                        self.fit(subsampled, subsampled, n_states, lag_time, 0, args, outfile)


    def fit(self, train, test, n_states, train_lag_time, fold, args, outfile):
        """Fit one ``GaussianFusionHMM`` and write its summary as one JSON line.

        Parameters
        ----------
        train, test : lists of timeseries used for fitting and for scoring
        n_states : number of hidden states for this model
        train_lag_time : subsampling stride used to build the data; fitted
            timescales are multiplied by it before being written
        fold : cross-validation fold number recorded in the output
        args : parsed command-line namespace supplying the hyperparameters
        outfile : open, writable text file receiving the JSON record
        """
        kwargs = dict(n_states=n_states, n_features=self.n_features, n_em_iter=args.n_em_iter,
            n_lqa_iter = args.n_lqa_iter, fusion_prior=args.fusion_prior,
            thresh=args.thresh, reversible_type=args.reversible_type,
                    platform=args.platform)
        print(kwargs)
        model = GaussianFusionHMM(**kwargs)

        start = time.time()
        model.fit(train)
        end = time.time()

        # np.real() discards any imaginary component before serialization.
        result = {
            'model': 'GaussianFusionHMM',
            'timescales': (np.real(model.timescales_()) * train_lag_time).tolist(),
            'transmat': np.real(model.transmat_).tolist(),
            'populations': np.real(model.populations_).tolist(),
            'n_states': model.n_states,
            'split': args.split,
            'fusion_prior': args.fusion_prior,
            'train_lag_time': train_lag_time,
            'train_time': end - start,
            'means': np.real(model.means_).tolist(),
            'vars': np.real(model.vars_).tolist(),
            'train_logprob': model.fit_logprob_[-1],
            'n_train_observations': sum(len(t) for t in train),
            'n_test_observations': sum(len(t) for t in test),
            'train_logprobs': model.fit_logprob_,
            #'test_lag_time': args.test_lag_time,
            'cross_validation_fold': fold,
            'cross_validation_nfolds': args.n_cv,
        }

        # model.transmat_ = contraction(model.transmat_, float(train_lag_time) / float(args.test_lag_time))
        # Don't do any contraction -- train and test at the same lagtime
        result['test_logprob'] = model.score(test)
        result['test_lag_time'] = train_lag_time

        # Warn (but still write the record) when the fit produced NaN/inf.
        if not np.all(np.isfinite(model.transmat_)):
            print('Nonfinite numbers in transmat !!')

        json.dump(result, outfile)
        outfile.write('\n')

    def load_data(self):
        """Return a list with one feature array per trajectory chunk.

        Each file is read in chunks of ``--split`` frames; files whose name
        ends in ``h5`` are loaded without passing the topology. Timing and
        dataset-size summaries are printed before returning.
        """
        load_time_start = time.time()
        data = []
        for tfn in self.filenames:
            kwargs = {} if tfn.endswith('h5') else {'top': self.top}
            for t in md.iterload(tfn, chunk=self.args.split, **kwargs):
                features = self.featurizer.featurize(t)
                data.append(features)

        print('Loading data into memory + vectorization: %f s' % (time.time() - load_time_start))
        print('Fitting with %s timeseries from %d trajectories with %d total observations' % (
            len(data), len(self.filenames), sum(len(e) for e in data)))

        return data
Пример #6
0
class SampleGHMM(Command, MDTrajInputMixin):
    """Command that samples representative frames for each state of a model.

    A model is picked from a jsonlines file by its number of states and
    training lag time; for each state a discrete distribution over all
    featurized frames is computed and ``--n-per-state`` frames are drawn
    from it. The drawn frames are written to a CSV file.
    """
    name = 'sample-ghmm'
    description = '''Draw iid samples from each state in a Gaussian HMM.

    The output is a small CSV file with 3 columns: 'filename', 'index',
    and 'state'. Each row gives the path to a trajectory file, the index
    of a single frame therein, and the state it was drawn from.

    The sampling strategy is as follows: for each state represented by a
    Gaussian distribution, we create a discrete distribution over the
    featurized frames in the specified trajectory files such that the
    discrete distribution has the same mean (and optionally variance) as the
    state Gaussian distribution and minimizes the K-L divergence from the
    discrete distribution to the continuous Gaussian it's trying to model. Then,
    we sample from that discrete distribution and return the corresponding
    frames in a CSV file.

    The reason for this complexity is that the Gaussian distributions for
    each state are continuous distributions over the featurized space. To
    visualize the structures corresponding to each state, however, we would
    need to sample from this distribution and then "invert" the featurization,
    to reconstruct the cartesian coordinates for our samples. Alternatively,
    we can draw from a discrete distribution over our available structures;
    but this introduces the question of what discrete distribution "optimally"
    represents the continuous (Gaussian) distribution of interest.

    [Reference]: Tanaka, Ken'ichiro, and Alexis Akira Toda. "Discrete
    approximations of continuous distributions by maximum entropy."
    Economics Letters 118.3 (2013): 445-450.
    '''

    group = argument_group('I/O Arguments')
    group.add_argument('-i', '--filename', required=True, metavar='JSONLINES_FILE',
        help='''Path to the jsonlines output file containg the HMMs''')
    group.add_argument('--featurizer', type=str, required=True,
        help='Path to saved featurizer object')
    group.add_argument('--n-states', type=int, required=True, help='''Number of
        states in the model to select from''')
    group.add_argument('--n-per-state', type=int, default=3, help='''Number of
        structures to pull from each state''')
    group.add_argument('--lag-time', type=int, required=True, help='''Training lag
        time of the model to select from''')
    group.add_argument('-o', '--out', metavar='OUTPUT_CSV_FILE',
        help='File to which to save the output, in CSV format. default="samples.csv"',
        default='samples.csv')

    match_vars = argument('--match-vars', action=FlagAction, default=True,
         help='''Constrain the discrete distribution to match the
         variances of the continuous distribution. default=enabled''')

    def __init__(self, args):
        """Select the model record and load the topology and featurizer.

        ``args`` is the parsed command-line namespace. Reports an error via
        ``self.error`` when the output file already exists, when no record
        in the jsonlines file has the requested ``n_states`` and
        ``train_lag_time``, or when no trajectory file matches.
        """
        if os.path.exists(args.out):
            self.error('IOError: file exists: %s' % args.out)
        matches = [o for o in iterobjects(args.filename)
                   if o['n_states'] == args.n_states
                   and o['train_lag_time'] == args.lag_time]
        if len(matches) == 0:
            self.error('No model with n_states=%d, train_lag_time=%d in %s.' % (
                args.n_states, args.lag_time, args.filename))

        self.args = args
        # When several records match, the first one in the file is used.
        self.model = matches[0]
        self.out = args.out
        # Expand "~" in the topology path, as is already done for --dir.
        self.topology = md.load(os.path.expanduser(args.top))
        self.filenames = glob.glob(os.path.join(os.path.expanduser(args.dir), '*.%s' % args.ext))
        self.featurizer = mixtape.featurizer.load(args.featurizer)
        self.match_vars = args.match_vars

        if len(self.filenames) == 0:
            self.error('No files matched.')


    def start(self):
        """Draw ``--n-per-state`` frames per state and write them as CSV.

        The CSV is preceded by a ``# command:`` line holding the invoking
        command line.
        """
        print('loading all data...')
        xx, ii, ff = mixtape.featurizer.featurize_all(self.filenames, self.featurizer, self.topology)
        print('done loading')

        data = {'filename': [], 'index': [], 'state': []}
        for k in range(self.model['n_states']):
            print('computing weights for k=%d...' % k)
            try:
                weights = discrete_approx_mvn(xx, self.model['means'][k],
                    self.model['vars'][k], self.match_vars)
            except NotSatisfiableError:
                self.error('Satisfiability failure. Could not match the means & '
                           'variances w/ discrete distribution. Try removing the '
                           'constraint on the variances with --no-match-vars?')

            cumsum = np.cumsum(weights)
            last = len(cumsum) - 1
            for _ in range(self.args.n_per_state):
                # Inverse-CDF draw: the number of cumulative weights strictly
                # below a uniform variate is the sampled frame. Rounding can
                # leave cumsum[-1] slightly under 1, in which case the count
                # would be one past the end, so clamp to the last frame.
                index = min(int(np.sum(cumsum < np.random.rand())), last)
                data['filename'].append(ff[index])
                data['index'].append(ii[index])
                data['state'].append(k)

        df = pd.DataFrame(data)
        print('Saving the indices of the sampled states in CSV format to %s' % self.out)
        with open(self.out, 'w') as f:
            f.write("# command: %s\n" % ' '.join(sys.argv))
            df.to_csv(f)
Пример #7
0
class PullMeansGHMM(SampleGHMM):
    """Command that picks the frames closest to the center of each state.

    Every featurized frame is assigned to the state under which its
    diagonal-Gaussian log density is highest; for each state the
    ``--n-per-state`` frames with the highest log density are written to
    a CSV file.
    """
    name = 'means-ghmm'
    description = '''Draw samples at the center of each state in a Gaussian HMM.'''

    group = argument_group('I/O Arguments')
    group.add_argument(
        '-i',
        '--filename',
        required=True,
        metavar='JSONLINES_FILE',
        help='''Path to the jsonlines output file containg the HMMs''')
    group.add_argument('--featurizer',
                       type=str,
                       required=True,
                       help='Path to saved featurizer object')
    group.add_argument('--n-states',
                       type=int,
                       required=True,
                       help='''Number of
        states in the model to select from''')
    group.add_argument('--n-per-state',
                       type=int,
                       default=1,
                       help='''Select the
        `n-per-state` most representative structures from each state. default=1'''
                       )
    group.add_argument('--lag-time',
                       type=int,
                       required=True,
                       help='''Training lag
        time of the model to select from''')
    group.add_argument(
        '-o',
        '--out',
        metavar='OUTPUT_CSV_FILE',
        help=
        'File to which to save the output, in CSV format. default="means.csv"',
        default='means.csv')

    def start(self):
        """Select the highest-density frames per state and write them as CSV.

        The CSV has the columns 'filename', 'index' and 'state' and is
        preceded by a ``# command:`` line holding the invoking command line.
        A state with no assigned frames only produces a warning.
        """
        featurizer = mixtape.featurizer.load(self.args.featurizer)

        features, ii, ff = mixtape.featurizer.featurize_all(
            self.filenames, featurizer, self.topology)
        logprob = log_multivariate_normal_density(features,
                                                  np.array(
                                                      self.model['means']),
                                                  np.array(self.model['vars']),
                                                  covariance_type='diag')

        # Hard-assign each frame to its most likely state and remember the
        # log density it has there.
        assignments = np.argmax(logprob, axis=1)
        probs = np.max(logprob, axis=1)

        data = {'filename': [], 'index': [], 'state': []}
        for k in range(self.model['n_states']):
            in_state = assignments == k
            p = probs[in_state]
            if len(p) == 0:
                print('WARNING: NO STRUCTURES ASSIGNED TO STATE=%d' % k)
                continue

            # Positions (within this state) of the frames with the highest
            # log probability, in ascending order of probability.
            best = p.argsort()[-self.args.n_per_state:]
            chosen_indices = ii[in_state][best]
            chosen_filenms = ff[in_state][best]

            data['index'].extend(chosen_indices)
            data['filename'].extend(chosen_filenms)
            # A state may hold fewer frames than requested; label exactly as
            # many rows as were selected so all columns stay the same length.
            data['state'].extend([k] * len(chosen_indices))

        df = pd.DataFrame(data)
        print('Saving the indices of the selected frames in CSV format to %s' %
              self.out)
        with open(self.out, 'w') as f:
            f.write("# command: %s\n" % ' '.join(sys.argv))
            df.to_csv(f)
Пример #8
0
class DihedralIndices(Command):
    """Command that writes an index file of dihedral-angle atom quadruples.

    The dihedral types to include are chosen with the ``--phi`` ...
    ``--chi4`` flags; the matching atom indices found in the PDB are saved
    as integer text rows to ``--out``.
    """
    description = "Create index file for dihedral angles."
    pdb = argument('-p', '--pdb', required=True, help='Path to PDB file')
    out = argument('-o', '--out', required=True, help='Path to output file')

    section2 = argument_group(
        description='Selection Criteria: Choose One or More')
    section2.add_argument('--phi',
                          action='store_true',
                          help='''Backbone phi
        (C-N-CA-C) angles''')
    section2.add_argument('--psi',
                          action='store_true',
                          help='''Backbone psi
        (N-CA-C-N) angles''')
    section2.add_argument('--omega',
                          action='store_true',
                          help='''Backbone omega
        (CA-C-N-CA) angles''')
    section2.add_argument('--chi1',
                          action='store_true',
                          help='''Chi1 is the
        first side chain torsion angle formed between the 4 atoms over the
        CA-CB axis.''')
    section2.add_argument('--chi2',
                          action='store_true',
                          help='''Chi2 is the
        second side chain torsion angle formed between the corresponding 4
        atoms over the CB-CG axis.''')
    section2.add_argument('--chi3',
                          action='store_true',
                          help='''Chi3 is the
        third side chain torsion angle formed between the corresponding 4 atoms
        over the CG-CD axis (only the residues ARG, GLN, GLU, LYS & MET have
        these atoms)''')
    section2.add_argument('--chi4',
                          action='store_true',
                          help='''Chi4 is the
        fourth side chain torsion angle formed between the corresponding 4
        atoms over the CD-CE or CD-NE axis (only ARG & LYS residues have these
        atoms)''')

    def __init__(self, args):
        """Refuse to overwrite ``--out``, then load the PDB and print its size.

        ``args`` is the parsed command-line namespace.
        """
        self.args = args
        if os.path.exists(args.out):
            self.error('IOError: file exists: %s' % args.out)
        # Expand "~" in the PDB path before handing it to the loader.
        self.pdb = md.load(os.path.expanduser(args.pdb))
        print('Loaded pdb containing (%d) chains, (%d) residues, (%d) atoms.' %
              (self.pdb.topology.n_chains, self.pdb.topology.n_residues,
               self.pdb.topology.n_atoms))

    def start(self):
        """Collect the selected dihedral quadruples and save them to ``--out``.

        Reports 'No dihedral angles matched.' via ``self.error`` when no
        selection flag was given or when nothing in the PDB matches.
        """
        dihedral_atom_types = []
        # phi/psi/omega each contribute a single atom pattern, whereas the
        # chi constants are collections of patterns (hence append vs extend).
        if self.args.phi:
            dihedral_atom_types.append(PHI_ATOMS)
        if self.args.psi:
            dihedral_atom_types.append(PSI_ATOMS)
        if self.args.omega:
            dihedral_atom_types.append(OMEGA_ATOMS)
        if self.args.chi1:
            dihedral_atom_types.extend(CHI1_ATOMS)
        if self.args.chi2:
            dihedral_atom_types.extend(CHI2_ATOMS)
        if self.args.chi3:
            dihedral_atom_types.extend(CHI3_ATOMS)
        if self.args.chi4:
            dihedral_atom_types.extend(CHI4_ATOMS)

        # With no flag set, the zip(*...) below would yield nothing and the
        # two-name unpacking would raise ValueError; report it cleanly.
        if not dihedral_atom_types:
            self.error('No dihedral angles matched.')

        rids, indices = zip(*(_atom_sequence(self.pdb, atoms)
                              for atoms in dihedral_atom_types))
        if not any(x.size for x in indices):
            self.error('No dihedral angles matched.')

        rids = np.concatenate(rids)
        id_sort = np.argsort(rids)
        # np.vstack needs a real sequence; generator input is deprecated
        # since NumPy 1.16 and rejected by recent releases.
        indices = np.vstack([x for x in indices if x.size])[id_sort]
        print('Selected (%d) dihedrals from (%d) unique residues.' %
              (len(indices), len(np.unique(rids))))
        np.savetxt(self.args.out, indices, '%d')
Пример #9
0
class AtomIndices(Command):
    """Command that writes an index file of atoms or of atom pairs.

    One selection flag (``--minimal``, ``--heavy``, ``--alpha``, ``--all``)
    picks the atoms and one mode flag (``--distance-pairs``, ``--atoms``)
    picks whether the atoms themselves or all their pairwise combinations
    are saved as integer text rows to ``--out``.
    """
    description = "Create index file for atoms or distance pairs."
    pdb = argument('-p', '--pdb', required=True, help='Path to PDB file')
    out = argument('-o', '--out', required=True, help='Path to output file')

    section1 = argument_group(description='Mode: Choose One')
    group1 = section1.add_mutually_exclusive_group(required=True)
    group1.add_argument(
        '-d', '--distance-pairs', action='store_true',
        help='''Create a 2-dimensional index file with (N choose 2) rows and 2
        columns, where each row specifies a pair of indices. All (N choose 2)
        pairs of the selected atoms will be written.''')
    group1.add_argument(
        '-a', '--atoms', action='store_true',
        help='''Create a 1-dimensional index file containing the indices of the
        selected atoms.''')

    section2 = argument_group(description='Selection Criteria: Choose One')
    group2 = section2.add_mutually_exclusive_group(required=True)
    group2.add_argument(
        '--minimal', action='store_true',
        help='''Keep the
        atoms in protein residues with names CA, CB, C, N, O, (recommended).''')
    group2.add_argument(
        '--heavy', action='store_true',
        help='''All
        non-hydrogen atoms that are not symmetry equivalent. By symmetry
        equivalent, we mean atoms identical under an exchange of labels. For
        example, heavy will exclude the two pairs of equivalent carbons (CD,
        CE) in a PHE ring.''')
    group2.add_argument(
        '--alpha', action='store_true',
        help='''Only alpha
        carbons.''')
    group2.add_argument(
        '--all', action='store_true',
        help='''Selection
        includes every atom.''')

    def __init__(self, args):
        """Refuse to overwrite ``--out``, then load the PDB and print its size.

        ``args`` is the parsed command-line namespace.
        """
        self.args = args
        if os.path.exists(args.out):
            self.error('IOError: file exists: %s' % args.out)
        self.pdb = md.load(os.path.expanduser(args.pdb))
        topology = self.pdb.topology
        print('Loaded pdb containing (%d) chains, (%d) residues, (%d) atoms.' %
              (topology.n_chains, topology.n_residues, topology.n_atoms))

    def start(self):
        """Select atoms per the chosen criterion and save them to ``--out``.

        Raises ``RuntimeError`` if no selection flag or no mode flag is set.
        """
        options = self.args
        topology = self.pdb.topology

        # --- which atoms ---------------------------------------------------
        if options.all:
            atom_indices = np.arange(self.pdb.n_atoms)
        elif options.alpha:
            atom_indices = [atom.index for atom in topology.atoms
                            if atom.name == 'CA']
        elif options.minimal:
            atom_indices = [
                atom.index for atom in topology.atoms
                if atom.name in ['CA', 'CB', 'C', 'N', 'O']
                and atom.residue.name in PROTEIN_RESIDUES]
        elif options.heavy:
            atom_indices = [
                atom.index for atom in topology.atoms
                if atom.element != element.hydrogen
                and atom.residue.name in PROTEIN_RESIDUES]
        else:
            raise RuntimeError

        residue_ids = [topology.atom(i).residue.index for i in atom_indices]
        print('Selected (%d) atoms from (%d) unique residues.' %
              (len(atom_indices), len(np.unique(residue_ids))))

        # --- what to write ---------------------------------------------------
        if options.distance_pairs:
            # Every unordered pair of the selected atoms, one pair per row.
            out = np.array(list(itertools.combinations(atom_indices, 2)))
        elif options.atoms:
            out = np.array(atom_indices)
        else:
            raise RuntimeError
        np.savetxt(options.out, out, '%d')