class SaveFeaturizer(Command, GaussianFeaturizationMixin):
    # Command that builds a featurizer from an index file and saves it to
    # ``--filename``. The index-file options (--distance-pairs /
    # --atom-indices) come from GaussianFeaturizationMixin.
    # NOTE: all of the work happens in __init__; start() does nothing.
    name = 'featurizer'
    description = '''Create and save a featurizer for later use.'''

    group_feature = argument_group('Featurizer Loading')
    group_feature.add_argument(
        '--top', type=str, required=True,
        help='''Topology file for loading trajectories''')
    group_feature.add_argument(
        '-o', '--filename', type=str, default='featurizer.pickl',
        help="Output featurizer to this filename. "
             "default='featurizer.pickl' ")

    def __init__(self, args):
        """Load the topology and index file, build the featurizer, save it.

        Reads ``args.top``, ``args.distance_pairs``, ``args.atom_indices``
        and ``args.filename``. A malformed index file is reported through
        ``self.error`` (presumably terminates the command -- defined on
        ``Command``, not visible here).
        """
        self.args = args
        self.top = (md.load(os.path.expanduser(args.top))
                    if args.top is not None else None)

        # Exactly one of the two index options is expected to be set (the
        # mixin declares them as a required mutually exclusive group).
        use_pairs = args.distance_pairs is not None
        index_path = args.distance_pairs if use_pairs else args.atom_indices

        # ndmin=2 guarantees a 2-D array so the column count can be checked.
        self.indices = np.loadtxt(index_path, dtype=int, ndmin=2)
        n_columns = self.indices.shape[1]

        if use_pairs:
            if n_columns != 2:
                self.error(
                    'distance-pairs must have shape (N, 2). %s had shape %s'
                    % (args.distance_pairs, self.indices.shape))
            featurizer = AtomPairsFeaturizer(self.indices, self.top)
        else:
            if n_columns != 1:
                self.error(
                    'atom-indices must have shape (N, 1). %s had shape %s'
                    % (args.atom_indices, self.indices.shape))
            # Flatten the (N, 1) column into a 1-D index vector.
            self.indices = self.indices.reshape(-1)
            featurizer = SuperposeFeaturizer(self.indices, self.top)

        featurizer.save(args.filename)

    def start(self):
        """No-op entry point; the featurizer was already saved in __init__."""
        args = self.args
class MDTrajInputMixin(object):
    """Mixin for a command to accept trajectory input files"""

    # All three options are mandatory: the directory to search, the topology
    # to load trajectories with, and the file extension to match.
    group_mdtraj = argument_group('MDTraj Options')
    group_mdtraj.add_argument(
        '--dir',
        type=str,
        required=True,
        help='''Directory containing the trajectories to load''')
    group_mdtraj.add_argument(
        '--top',
        type=str,
        required=True,
        help='''Topology file for loading trajectories''')
    # The registered loader keys carry a leading character (presumably the
    # '.' of the extension -- confirm against mdtraj); it is stripped so the
    # user types e.g. "h5" rather than ".h5".
    group_mdtraj.add_argument(
        '--ext',
        required=True,
        choices=[e[1:] for e in md._FormatRegistry.loaders.keys()],
        help='File extension of the trajectories')
class GaussianFeaturizationMixin(object):
    # Mixin declaring how trajectories are vectorized. Exactly one of
    # --distance-pairs / --atom-indices must be supplied (required,
    # mutually exclusive group).
    group_munge = argument_group('Munging Options')
    group_vector = group_munge.add_mutually_exclusive_group(required=True)

    group_vector.add_argument(
        '-d', '--distance-pairs',
        type=str,
        help=('Vectorize the MD trajectories by extracting timeseries of '
              'the distance between pairs of atoms in each frame. Supply a '
              'text file where each row contains the space-separate indices '
              'of two atoms which form a pair to monitor'))

    group_vector.add_argument(
        '-a', '--atom-indices',
        type=str,
        help=('Superpose each MD conformation on the coordinates in the '
              'topology file, and then use the distance from each atom in '
              'the reference conformation to the corresponding atom in each '
              'MD conformation.'))
class FitVMHMM(Command, MDTrajInputMixin):
    # Command that fits von-Mises HMMs to dihedral-angle timeseries and
    # appends one JSON record per fitted model to the --out file.
    name = 'fit-vmhmm'
    description = (
        'Fit von-Mises hidden Markov models with EM. '
        'The von Mises distribution, (also known as the circular normal '
        'distribution or Tikhonov distribution) is a continuous probability '
        'distribution on the circle. For multivariate signals, the emissions '
        'distribution implemented by this model is a product of univariate '
        'von Mises distributions -- analogous to the multivariate Gaussian '
        'distribution with a diagonal covariance matrix. '
        'Because the support of the base 1D distribution is on [-pi, pi), '
        'this model makes a suitable emission distribution for timeseries of '
        'angles (e.g. protein dihedral angles). ')

    group_munge = argument_group('Munging Options')
    group_munge.add_argument(
        '-d', '--dihedral-indices', required=True, type=str,
        help='''Vectorize the MD trajectories by extracting timeseries of the dihedral (torsion) angles between sets of 4 atoms. Supply a text file where each row contains the space-separate indices of four atoms which form a dihedral angle to monitor. These indices are 0-based.''')

    group_hmm = argument_group('HMM Options')
    group_hmm.add_argument(
        '-k', '--n-states', action=MultipleIntAction, default=[2],
        help='Number of states in the models. Default = [2,]', nargs='+')
    group_hmm.add_argument(
        '-l', '--lag-times', action=MultipleIntAction, default=[1],
        help='Lag time(s) of the model(s). Default = [1,]', nargs='+')
    # group_hmm.add_argument('--platform', choices=['cuda', 'cpu', 'sklearn'],
    #     default='cpu', help='Implementation platform. default="cpu"')
    # group_hmm.add_argument('--fusion-prior', type=float, default=1e-2,
    #     help='Strength of the adaptive fusion prior. default=1e-2')
    group_hmm.add_argument(
        '--n-em-iter', type=int, default=100,
        help='Maximum number of iterations of EM. default=100')
    group_hmm.add_argument(
        '--thresh', type=float, default=1e-2,
        help='''Convergence criterion for EM. Quit when the log likelihood decreases by less than this threshold. default=1e-2''')
    # group_hmm.add_argument('--n-lqa-iter', type=int, default=10,
    #     help='''Max number of iterations for local quadradric approximation
    #     solving the fusion-L1. default=10''')
    group_hmm.add_argument(
        '--reversible-type', choices=['mle', 'transpose'], default='mle',
        help='''Method by which the model is constrained to be reversible. default="mle"''')
    group_hmm.add_argument(
        '-sp', '--split', type=int, default=10000,
        help='''Split trajectories into smaller chunks. This looses some counts (i.e. like 1%% of the counts are lost with --split 100), but can help with speed (on gpu + multicore cpu) and numerical instabilities that come when trajectories get extremely long.''')

    group_out = argument_group('Output')
    group_out.add_argument('-o', '--out', default='hmms.jsonlines',
                           help='Output file. default="hmms.jsonlines"')

    def __init__(self, args):
        """Load the topology and dihedral index file; glob the trajectories.

        ``~`` is expanded in ``--top`` and ``--dir`` so this command treats
        paths the same way as the sibling ``fit-ghmm`` command. A malformed
        index file is reported through ``self.error`` (presumably terminates
        the command -- defined on ``Command``, not visible here).
        """
        self.args = args
        if args.top is not None:
            self.top = md.load(os.path.expanduser(args.top))
        else:
            self.top = None

        # ndmin=2 guarantees a 2-D array so the column count can be checked.
        self.indices = np.loadtxt(args.dihedral_indices, dtype=int, ndmin=2)
        if self.indices.shape[1] != 4:
            self.error(
                'dihedral-indices must have shape (N, 4). %s had shape %s'
                % (args.dihedral_indices, self.indices.shape))

        self.filenames = glob.glob(
            os.path.expanduser(args.dir) + '/*.' + args.ext)
        # One feature per monitored dihedral angle.
        self.n_features = self.indices.shape[0]

    def start(self):
        """Fit one model per (lag time, n_states) pair and append results."""
        args = self.args
        data = self.load_data()

        # Unbuffered text mode (buffering=0) is rejected by Python 3, so the
        # file is opened normally and flushed explicitly after every record
        # to keep results visible while long fits are still running.
        with open(args.out, 'a') as outfile:
            outfile.write('# %s\n' % ' '.join(sys.argv))
            outfile.flush()
            for lag_time in args.lag_times:
                # Subsample every timeseries at the requested lag.
                subsampled = [d[::lag_time] for d in data]
                for n_states in args.n_states:
                    self.fit(subsampled, n_states, lag_time, outfile)

    def fit(self, data, n_states, lag_time, outfile):
        """Fit a single VonMisesHMM and write its summary as one JSON line.

        Parameters
        ----------
        data : list of timeseries passed straight to ``model.fit``
        n_states : number of hidden states
        lag_time : lag the data was subsampled at; used to rescale timescales
        outfile : open, writable text file
        """
        model = VonMisesHMM(n_states=n_states,
                            reversible_type=self.args.reversible_type,
                            n_iter=self.args.n_em_iter,
                            thresh=self.args.thresh)
        start = time.time()
        model.fit(data)
        end = time.time()

        result = {
            'model': 'VonMisesHMM',
            'timescales': (model.timescales_() * lag_time).tolist(),
            'transmat': model.transmat_.tolist(),
            'populations': model.populations_.tolist(),
            'n_states': model.n_states,
            'split': self.args.split,
            'train_lag_time': lag_time,
            'train_time': end - start,
            'means': model.means_.tolist(),
            'kappas': model.kappas_.tolist(),
            'train_logprobs': model.fit_logprob_,
            'n_train_observations': sum(len(t) for t in data),
        }
        if not np.all(np.isfinite(model.transmat_)):
            print('Nonfinite numbers in transmat !!')
        json.dump(result, outfile)
        outfile.write('\n')
        outfile.flush()

    def load_data(self):
        """Return a list of dihedral-angle arrays, one per trajectory chunk.

        Each trajectory file is read in chunks of ``--split`` frames; every
        chunk becomes its own timeseries.
        """
        data = []
        for tfn in self.filenames:
            # Files ending in 'h5' are loaded without an explicit topology.
            kwargs = {} if tfn.endswith('h5') else {'top': self.top}
            for t in md.iterload(tfn, chunk=self.args.split, **kwargs):
                item = np.asarray(md.compute_dihedrals(t, self.indices),
                                  np.double)
                data.append(item)
        return data
class FitGHMM(Command, MDTrajInputMixin):
    # Command that fits Gaussian fusion HMMs to featurized trajectories,
    # optionally with K-fold cross validation, and appends one JSON record
    # per fitted model to the --out file.
    name = 'fit-ghmm'
    description = '''Fit L1-Regularized Reversible Gaussian hidden Markov models with EM.'''

    group_hmm = argument_group('HMM Options')
    group_hmm.add_argument('--featurizer', type=str, required=True,
                           help='Path to saved featurizer object')
    group_hmm.add_argument(
        '-k', '--n-states', action=MultipleIntAction, default=[2],
        help='Number of states in the models. Default = [2,]', nargs='+')
    group_hmm.add_argument(
        '-l', '--lag-times', action=MultipleIntAction, default=[1],
        help='Lag time(s) of the model(s). Default = [1,]', nargs='+')
    group_hmm.add_argument(
        '--platform', choices=['cuda', 'cpu', 'sklearn'], default='cpu',
        help='Implementation platform. default="cpu"')
    group_hmm.add_argument(
        '--fusion-prior', type=float, default=1e-2,
        help='Strength of the adaptive fusion prior. default=1e-2')
    group_hmm.add_argument(
        '--n-em-iter', type=int, default=100,
        help='Maximum number of iterations of EM. default=100')
    group_hmm.add_argument(
        '--thresh', type=float, default=1e-2,
        help='''Convergence criterion for EM. Quit when the log likelihood decreases by less than this threshold. default=1e-2''')
    group_hmm.add_argument(
        '--n-lqa-iter', type=int, default=10,
        help='''Max number of iterations for local quadradric approximation solving the fusion-L1. default=10''')
    group_hmm.add_argument(
        '--reversible-type', choices=['mle', 'transpose'], default='mle',
        help='''Method by which the model is constrained to be reversible. default="mle"''')
    group_hmm.add_argument(
        '-sp', '--split', type=int, default=10000,
        help='''Split trajectories into smaller chunks. This looses some counts (i.e. like 1%% of the counts are lost with --split 100), but can help with speed (on gpu + multicore cpu) and numerical instabilities that come when trajectories get extremely long.''')

    group_cv = argument_group('Cross Validation')
    group_cv.add_argument('--n-cv', type=int, default=1,
                          help='Run N-fold cross validation. default=1')
    # We're training and testing at the same lag time for the moment
    # group_cv.add_argument('--test-lag-time', type=int, default=1,
    #     help='Lag time at which to test the models. default=1')

    group_out = argument_group('Output')
    group_out.add_argument('-o', '--out', default='hmms.jsonlines',
                           help='Output file. default="hmms.jsonlines"')

    def __init__(self, args):
        """Load the topology and saved featurizer; glob the trajectories."""
        self.args = args
        if args.top is not None:
            self.top = md.load(os.path.expanduser(args.top))
        else:
            self.top = None
        self.featurizer = mixtape.featurizer.load(args.featurizer)
        self.filenames = glob.glob(
            os.path.expanduser(args.dir) + '/*.' + args.ext)
        self.n_features = self.featurizer.n_features

    def start(self):
        """Fit one model per (lag time, n_states[, fold]) and append results.

        With ``--n-cv`` > 1 the list of timeseries is split with ``KFold``;
        otherwise the model is trained and scored on the same data.
        """
        args = self.args
        data = self.load_data()

        # Unbuffered text mode (buffering=0) is rejected by Python 3, so the
        # file is opened normally and flushed explicitly after every record
        # to keep results visible while long fits are still running.
        with open(args.out, 'a') as outfile:
            outfile.write('# %s\n' % ' '.join(sys.argv))
            outfile.flush()
            for lag_time in args.lag_times:
                # Subsample every timeseries at the requested lag.
                subsampled = [d[::lag_time] for d in data]
                for n_states in args.n_states:
                    if args.n_cv > 1:
                        folds = KFold(n=len(data), n_folds=args.n_cv)
                        for fold, (train_i, test_i) in enumerate(folds):
                            train = [subsampled[i] for i in train_i]
                            test = [subsampled[i] for i in test_i]
                            self.fit(train, test, n_states, lag_time, fold,
                                     args, outfile)
                    else:
                        self.fit(subsampled, subsampled, n_states, lag_time,
                                 0, args, outfile)

    def fit(self, train, test, n_states, train_lag_time, fold, args, outfile):
        """Fit a single GaussianFusionHMM and write its summary as JSON.

        Parameters
        ----------
        train, test : lists of timeseries used for fitting and scoring
        n_states : number of hidden states
        train_lag_time : lag the data was subsampled at; used to rescale
            timescales and recorded as the test lag time too
        fold : index of the cross-validation fold (0 when not cross-validating)
        args : parsed command-line namespace
        outfile : open, writable text file
        """
        kwargs = dict(n_states=n_states, n_features=self.n_features,
                      n_em_iter=args.n_em_iter,
                      n_lqa_iter=args.n_lqa_iter,
                      fusion_prior=args.fusion_prior,
                      thresh=args.thresh,
                      reversible_type=args.reversible_type,
                      platform=args.platform)
        print(kwargs)
        model = GaussianFusionHMM(**kwargs)

        start = time.time()
        model.fit(train)
        end = time.time()

        # np.real() drops any imaginary component from the model attributes
        # before they are serialized.
        result = {
            'model': 'GaussianFusionHMM',
            'timescales': (np.real(model.timescales_())
                           * train_lag_time).tolist(),
            'transmat': np.real(model.transmat_).tolist(),
            'populations': np.real(model.populations_).tolist(),
            'n_states': model.n_states,
            'split': args.split,
            'fusion_prior': args.fusion_prior,
            'train_lag_time': train_lag_time,
            'train_time': end - start,
            'means': np.real(model.means_).tolist(),
            'vars': np.real(model.vars_).tolist(),
            'train_logprob': model.fit_logprob_[-1],
            'n_train_observations': sum(len(t) for t in train),
            'n_test_observations': sum(len(t) for t in test),
            'train_logprobs': model.fit_logprob_,
            # 'test_lag_time': args.test_lag_time,
            'cross_validation_fold': fold,
            'cross_validation_nfolds': args.n_cv,
        }

        # model.transmat_ = contraction(model.transmat_,
        #     float(train_lag_time) / float(args.test_lag_time))
        # Don't do any contraction -- train and test at the same lagtime
        result['test_logprob'] = model.score(test)
        result['test_lag_time'] = train_lag_time

        if not np.all(np.isfinite(model.transmat_)):
            print('Nonfinite numbers in transmat !!')
        json.dump(result, outfile)
        outfile.write('\n')
        outfile.flush()

    def load_data(self):
        """Return a list of feature arrays, one per trajectory chunk.

        Each trajectory file is read in chunks of ``--split`` frames and
        every chunk is featurized into its own timeseries. Timing and size
        statistics are printed once loading finishes.
        """
        load_time_start = time.time()
        data = []
        for tfn in self.filenames:
            # Files ending in 'h5' are loaded without an explicit topology.
            kwargs = {} if tfn.endswith('h5') else {'top': self.top}
            for t in md.iterload(tfn, chunk=self.args.split, **kwargs):
                features = self.featurizer.featurize(t)
                data.append(features)

        print('Loading data into memory + vectorization: %f s'
              % (time.time() - load_time_start))
        print('Fitting with %s timeseries from %d trajectories with %d total observations' % (
            len(data), len(self.filenames), sum(len(e) for e in data)))
        return data
class SampleGHMM(Command, MDTrajInputMixin):
    # Command that picks representative frames for every state of a fitted
    # Gaussian HMM by sampling from a discrete approximation of each state's
    # Gaussian, and writes the picks to a CSV file.
    name = 'sample-ghmm'
    description = (
        "Draw iid samples from each state in a Gaussian HMM. "
        "The output is a small CSV file with 3 columns: 'filename', "
        "'index', and 'state'. Each row gives the path to a trajectory "
        "file, the index of a single frame therein, and the state it was "
        "drawn from. "
        "The sampling strategy is as follows: for each state represented by "
        "a Gaussian distribution, we create a discrete distribution over "
        "the featurized frames in the specified trajectory files such that "
        "the discrete distribution has the same mean (and optionally "
        "variance) as the state Gaussian distribution and minimizes the K-L "
        "divergence from the discrete distribution to the continuous "
        "Gaussian it's trying to model. Then, we sample from that discrete "
        "distribution and return the corresponding frames in a CSV file. "
        "The reason for this complexity is that the Gaussian distributions "
        "for each state are continuous distributions over the featurized "
        "space. To visualize the structures corresponding to each state, "
        "however, we would need to sample from this distribution and then "
        "\"invert\" the featurization, to reconstruct the cartesian "
        "coordinates for our samples. Alternatively, we can draw from a "
        "discrete distribution over our available structures; but this "
        "introduces the question of what discrete distribution "
        "\"optimally\" represents the continuous (Gaussian) distribution of "
        "interest. "
        "[Reference]: Tanaka, Ken'ichiro, and Alexis Akira Toda. \"Discrete "
        "approximations of continuous distributions by maximum entropy.\" "
        "Economics Letters 118.3 (2013): 445-450. ")

    group = argument_group('I/O Arguments')
    group.add_argument(
        '-i', '--filename', required=True, metavar='JSONLINES_FILE',
        help='''Path to the jsonlines output file containg the HMMs''')
    group.add_argument('--featurizer', type=str, required=True,
                       help='Path to saved featurizer object')
    group.add_argument(
        '--n-states', type=int, required=True,
        help='''Number of states in the model to select from''')
    group.add_argument(
        '--n-per-state', type=int, default=3,
        help='''Number of structures to pull from each state''')
    group.add_argument(
        '--lag-time', type=int, required=True,
        help='''Training lag time of the model to select from''')
    group.add_argument(
        '-o', '--out', metavar='OUTPUT_CSV_FILE', default='samples.csv',
        help='File to which to save the output, in CSV format. default="samples.csv"')
    match_vars = argument(
        '--match-vars', action=FlagAction, default=True,
        help='''Constrain the discrete distribution to match the variances of the continuous distribution. default=enabled''')

    def __init__(self, args):
        """Select the requested model and load topology, files, featurizer.

        The first record in ``--filename`` whose ``n_states`` and
        ``train_lag_time`` match the request is used. Problems (existing
        output file, no matching model, no trajectory files) are reported
        through ``self.error`` (presumably terminates the command -- defined
        on ``Command``, not visible here).
        """
        if os.path.exists(args.out):
            self.error('IOError: file exists: %s' % args.out)

        matches = [o for o in iterobjects(args.filename)
                   if o['n_states'] == args.n_states
                   and o['train_lag_time'] == args.lag_time]
        if len(matches) == 0:
            self.error('No model with n_states=%d, train_lag_time=%d in %s.' % (
                args.n_states, args.lag_time, args.filename))

        self.args = args
        self.model = matches[0]
        self.out = args.out
        # Expand '~' in --top as well, matching how --dir is handled below.
        self.topology = md.load(os.path.expanduser(args.top))
        self.filenames = glob.glob(
            os.path.join(os.path.expanduser(args.dir), '*.%s' % args.ext))
        self.featurizer = mixtape.featurizer.load(args.featurizer)
        self.match_vars = args.match_vars

        if len(self.filenames) == 0:
            self.error('No files matched.')

    def start(self):
        """Sample ``--n-per-state`` frames per state and write them as CSV."""
        print('loading all data...')
        xx, ii, ff = mixtape.featurizer.featurize_all(
            self.filenames, self.featurizer, self.topology)
        print('done loading')

        data = {'filename': [], 'index': [], 'state': []}
        for k in range(self.model['n_states']):
            print('computing weights for k=%d...' % k)
            try:
                weights = discrete_approx_mvn(
                    xx, self.model['means'][k], self.model['vars'][k],
                    self.match_vars)
            except NotSatisfiableError:
                self.error('Satisfiability failure. Could not match the means & '
                           'variances w/ discrete distribution. Try removing the '
                           'constraint on the variances with --no-match-vars?')

            # Inverse-CDF sampling: the number of cumulative weights below a
            # uniform draw is the index of the sampled frame.
            cumsum = np.cumsum(weights)
            last = len(cumsum) - 1
            for _ in range(self.args.n_per_state):
                # Floating-point rounding can leave cumsum[-1] slightly below
                # 1, so a draw above it would count every entry and index one
                # past the end; clamp to the last valid frame.
                index = min(int(np.sum(cumsum < np.random.rand())), last)
                data['filename'].append(ff[index])
                data['index'].append(ii[index])
                data['state'].append(k)

        df = pd.DataFrame(data)
        print('Saving the indices of the sampled states in CSV format to %s'
              % self.out)
        with open(self.out, 'w') as f:
            # Record the invoking command line as a leading comment.
            f.write("# command: %s\n" % ' '.join(sys.argv))
            df.to_csv(f)
class PullMeansGHMM(SampleGHMM):
    # Command that, for every state of a fitted Gaussian HMM, selects the
    # frames with the highest log probability under that state and writes
    # them to a CSV file. Model/file loading is inherited from SampleGHMM.
    name = 'means-ghmm'
    description = '''Draw samples at the center of each state in a Gaussian HMM.'''

    group = argument_group('I/O Arguments')
    group.add_argument(
        '-i', '--filename', required=True, metavar='JSONLINES_FILE',
        help='''Path to the jsonlines output file containg the HMMs''')
    group.add_argument('--featurizer', type=str, required=True,
                       help='Path to saved featurizer object')
    group.add_argument(
        '--n-states', type=int, required=True,
        help='''Number of states in the model to select from''')
    group.add_argument(
        '--n-per-state', type=int, default=1,
        help='''Select the `n-per-state` most representative structures from each state. default=1''')
    group.add_argument(
        '--lag-time', type=int, required=True,
        help='''Training lag time of the model to select from''')
    group.add_argument(
        '-o', '--out', metavar='OUTPUT_CSV_FILE', default='means.csv',
        help='File to which to save the output, in CSV format. default="means.csv')

    def start(self):
        """Pick the most probable frames of each state and write them as CSV.

        Every frame is assigned to the state under which its diagonal
        Gaussian log density is highest; within each state the
        ``--n-per-state`` frames with the largest log density are kept. A
        state with fewer assigned frames contributes only the frames it has.
        """
        # The featurizer was already loaded by the inherited __init__.
        features, ii, ff = mixtape.featurizer.featurize_all(
            self.filenames, self.featurizer, self.topology)
        logprob = log_multivariate_normal_density(
            features, np.array(self.model['means']),
            np.array(self.model['vars']), covariance_type='diag')

        assignments = np.argmax(logprob, axis=1)
        probs = np.max(logprob, axis=1)

        data = {'filename': [], 'index': [], 'state': []}
        n_per_state = self.args.n_per_state
        for k in range(self.model['n_states']):
            in_state = assignments == k
            if not np.any(in_state):
                print('WARNING: NO STRUCTURES ASSIGNED TO STATE=%d' % k)
                continue

            # argsort is ascending, so the tail holds the frames with the
            # highest log probability in this state.
            best = probs[in_state].argsort()[-n_per_state:]
            data['index'].extend(ii[in_state][best])
            data['filename'].extend(ff[in_state][best])
            # Use the number of frames actually selected: a state may hold
            # fewer than n_per_state frames, and all three columns must stay
            # the same length for the DataFrame to be built.
            data['state'].extend([k] * len(best))

        df = pd.DataFrame(data)
        print('Saving the indices of the selected frames in CSV format to %s'
              % self.out)
        with open(self.out, 'w') as f:
            # Record the invoking command line as a leading comment.
            f.write("# command: %s\n" % ' '.join(sys.argv))
            df.to_csv(f)
class DihedralIndices(Command):
    # Command that writes a text file of 4-atom index rows, one per selected
    # dihedral angle, for the structure in a PDB file.
    description = "Create index file for dihedral angles."
    pdb = argument('-p', '--pdb', required=True, help='Path to PDB file')
    out = argument('-o', '--out', required=True, help='Path to output file')

    section2 = argument_group(
        description='Selection Criteria: Choose One or More')
    section2.add_argument(
        '--phi', action='store_true',
        help='''Backbone phi (C-N-CA-C) angles''')
    section2.add_argument(
        '--psi', action='store_true',
        help='''Backbone psi (N-CA-C-N) angles''')
    section2.add_argument(
        '--omega', action='store_true',
        help='''Backbone omega (CA-C-N-CA) angles''')
    section2.add_argument(
        '--chi1', action='store_true',
        help='''Chi1 is the first side chain torsion angle formed between the 4 atoms over the CA-CB axis.''')
    section2.add_argument(
        '--chi2', action='store_true',
        help='''Chi2 is the second side chain torsion angle formed between the corresponding 4 atoms over the CB-CG axis.''')
    section2.add_argument(
        '--chi3', action='store_true',
        help='''Chi3 is the third side chain torsion angle formed between the corresponding 4 atoms over the CG-CD axis (only the residues ARG, GLN, GLU, LYS & MET have these atoms)''')
    section2.add_argument(
        '--chi4', action='store_true',
        help='''Chi4 is the fourth side chain torsion angle formed between the corresponding 4 atoms over the CD-CE or CD-NE axis (only ARG & LYS residues have these atoms)''')

    def __init__(self, args):
        """Refuse to overwrite the output file, then load the PDB.

        Errors are reported through ``self.error`` (presumably terminates
        the command -- defined on ``Command``, not visible here).
        """
        self.args = args
        if os.path.exists(args.out):
            self.error('IOError: file exists: %s' % args.out)
        # Expand '~' so paths behave the same as in the AtomIndices command.
        self.pdb = md.load(os.path.expanduser(args.pdb))
        print('Loaded pdb containing (%d) chains, (%d) residues, (%d) atoms.'
              % (self.pdb.topology.n_chains, self.pdb.topology.n_residues,
                 self.pdb.topology.n_atoms))

    def start(self):
        """Collect the selected dihedral index rows and save them.

        Rows are ordered by the residue ids returned by ``_atom_sequence``
        and written with ``np.savetxt`` as integers.
        """
        args = self.args
        dihedral_atom_types = []
        # phi/psi/omega are each a single atom pattern (appended); each chiN
        # constant is a collection of patterns (extended).
        if args.phi:
            dihedral_atom_types.append(PHI_ATOMS)
        if args.psi:
            dihedral_atom_types.append(PSI_ATOMS)
        if args.omega:
            dihedral_atom_types.append(OMEGA_ATOMS)
        if args.chi1:
            dihedral_atom_types.extend(CHI1_ATOMS)
        if args.chi2:
            dihedral_atom_types.extend(CHI2_ATOMS)
        if args.chi3:
            dihedral_atom_types.extend(CHI3_ATOMS)
        if args.chi4:
            dihedral_atom_types.extend(CHI4_ATOMS)

        # Without this guard, unpacking zip(*()) below fails with an opaque
        # "not enough values to unpack" ValueError.
        if not dihedral_atom_types:
            self.error('No selection criteria given. Choose one or more of '
                       '--phi, --psi, --omega, --chi1, --chi2, --chi3, '
                       '--chi4.')

        rids, indices = zip(*(_atom_sequence(self.pdb, atoms)
                              for atoms in dihedral_atom_types))
        rids = np.concatenate(rids)
        id_sort = np.argsort(rids)

        if not any(x.size for x in indices):
            self.error('No dihedral angles matched.')

        # np.vstack needs a real sequence; passing a generator is deprecated
        # and rejected by newer NumPy releases.
        indices = np.vstack([x for x in indices if x.size])[id_sort]
        print('Selected (%d) dihedrals from (%d) unique residues.'
              % (len(indices), len(np.unique(rids))))
        np.savetxt(args.out, indices, '%d')
class AtomIndices(Command):
    # Command that writes a text file of atom indices, or of all pairs of
    # the selected atoms, for the structure in a PDB file.
    description="Create index file for atoms or distance pairs."
    pdb = argument('-p', '--pdb', required=True, help='Path to PDB file')
    out = argument('-o', '--out', required=True, help='Path to output file')

    # Output mode: exactly one of --distance-pairs / --atoms.
    section1 = argument_group(description='Mode: Choose One')
    group1 = section1.add_mutually_exclusive_group(required=True)
    group1.add_argument(
        '-d', '--distance-pairs', action='store_true',
        help='''Create a 2-dimensional index file with (N choose 2) rows and 2 columns, where each row specifies a pair of indices. All (N choose 2) pairs of the selected atoms will be written.''')
    group1.add_argument(
        '-a', '--atoms', action='store_true',
        help='''Create a 1-dimensional index file containing the indices of the selected atoms.''')

    # Atom selection: exactly one of --minimal / --heavy / --alpha / --all.
    section2 = argument_group(description='Selection Criteria: Choose One')
    group2 = section2.add_mutually_exclusive_group(required=True)
    group2.add_argument(
        '--minimal', action='store_true',
        help='''Keep the atoms in protein residues with names CA, CB, C, N, O, (recommended).''')
    group2.add_argument(
        '--heavy', action='store_true',
        help='''All non-hydrogen atoms that are not symmetry equivalent. By symmetry equivalent, we mean atoms identical under an exchange of labels. For example, heavy will exclude the two pairs of equivalent carbons (CD, CE) in a PHE ring.''')
    group2.add_argument(
        '--alpha', action='store_true',
        help='''Only alpha carbons.''')
    group2.add_argument(
        '--all', action='store_true',
        help='''Selection includes every atom.''')

    def __init__(self, args):
        """Refuse to overwrite the output file, then load the PDB.

        Errors are reported through ``self.error`` (presumably terminates
        the command -- defined on ``Command``, not visible here).
        """
        self.args = args
        if os.path.exists(args.out):
            self.error('IOError: file exists: %s' % args.out)

        pdb_path = os.path.expanduser(args.pdb)
        self.pdb = md.load(pdb_path)
        summary = (self.pdb.topology.n_chains,
                   self.pdb.topology.n_residues,
                   self.pdb.topology.n_atoms)
        print('Loaded pdb containing (%d) chains, (%d) residues, (%d) atoms.'
              % summary)

    def start(self):
        """Select atoms by the chosen criterion and save indices or pairs."""
        opts = self.args

        # --- atom selection -------------------------------------------------
        if opts.all:
            atom_indices = np.arange(self.pdb.n_atoms)
        elif opts.alpha:
            atom_indices = []
            for atom in self.pdb.topology.atoms:
                if atom.name == 'CA':
                    atom_indices.append(atom.index)
        elif opts.minimal:
            atom_indices = []
            for atom in self.pdb.topology.atoms:
                if (atom.name in ['CA', 'CB', 'C', 'N', 'O']
                        and atom.residue.name in PROTEIN_RESIDUES):
                    atom_indices.append(atom.index)
        elif opts.heavy:
            atom_indices = []
            for atom in self.pdb.topology.atoms:
                if (atom.element != element.hydrogen
                        and atom.residue.name in PROTEIN_RESIDUES):
                    atom_indices.append(atom.index)
        else:
            # Unreachable when the required mutually exclusive group is
            # enforced by the argument parser.
            raise RuntimeError

        residue_ids = [self.pdb.topology.atom(i).residue.index
                       for i in atom_indices]
        n_residues = len(np.unique(residue_ids))
        print('Selected (%d) atoms from (%d) unique residues.'
              % (len(atom_indices), n_residues))

        # --- output mode ----------------------------------------------------
        if opts.distance_pairs:
            # Every unordered pair of the selected atoms, one pair per row.
            pairs = list(itertools.combinations(atom_indices, 2))
            out = np.array(pairs)
        elif opts.atoms:
            out = np.array(atom_indices)
        else:
            raise RuntimeError

        np.savetxt(opts.out, out, '%d')