def test_score_2():
    X = np.random.randn(100, 5)
    Y = np.random.randn(100, 5)
    model = tICA(shrinkage=0.0, n_components=2).fit([X])
    s1 = model.score([Y])
    s2 = tICA(shrinkage=0.0).fit(model.transform([Y])).eigenvalues_.sum()
    eq(s1, s2)
def test_kinetic_mapping():
    np.random.seed(42)
    X = np.random.randn(10, 3)
    tica1 = tICA(n_components=2, lag_time=1)
    tica2 = tICA(n_components=2, lag_time=1, kinetic_mapping=True)
    y1 = tica1.fit_transform([np.copy(X)])[0]
    y2 = tica2.fit_transform([np.copy(X)])[0]
    assert eq(y2, y1 * tica1.eigenvalues_)
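# The sketch below (not part of the original test suite) restates the relation
# test_kinetic_mapping verifies: with kinetic_mapping=True, the projections
# are the plain tICA projections scaled column-wise by the eigenvalues.
# The data and variable names here are illustrative only.
import numpy as np
from msmbuilder.decomposition import tICA

X = np.random.randn(200, 3)
plain = tICA(n_components=2, lag_time=1).fit([X])
kinetic = tICA(n_components=2, lag_time=1, kinetic_mapping=True).fit([X])
y_plain = plain.transform([X])[0]
y_kinetic = kinetic.transform([X])[0]
# column-wise scaling by the eigenvalues recovers the kinetic mapping
assert np.allclose(y_kinetic, y_plain * plain.eigenvalues_)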
def landmark_ktica_ticaTraj(tica_dir, clusterer_dir, ktica_dir,
                            clusters_map_file="", landmarks_dir="",
                            nystroem_components=1000, n_components=10,
                            lag_time=5, nystroem_data_filename="",
                            fit_model_filename="", projected_data_filename="",
                            landmark_subsample=1, sparse=False, wolf=True,
                            rho=0.01, shrinkage=None):
    if not os.path.exists(ktica_dir):
        os.makedirs(ktica_dir)

    if not sparse:
        if shrinkage is None:
            tica_model = tICA(n_components=n_components, lag_time=lag_time)
        else:
            tica_model = tICA(n_components=n_components, lag_time=lag_time,
                              shrinkage=shrinkage)
    else:
        if shrinkage is None:
            tica_model = SparseTICA(n_components=n_components,
                                    lag_time=lag_time, rho=rho)
        else:
            tica_model = SparseTICA(n_components=n_components,
                                    lag_time=lag_time, rho=rho,
                                    shrinkage=shrinkage)

    if not os.path.exists(nystroem_data_filename):
        clusterer = verboseload(clusterer_dir)
        tica = verboseload(tica_dir)
        features = tica
        clusters = clusterer.cluster_centers_
        landmarks = clusters

        print("here's what goes into the combined class:")
        #print(np.shape(features))
        print(np.shape(landmarks))
        print(type(landmarks))

        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)
        del features
        del landmarks
        try:
            save_dataset(nyx, nystroem_data_filename)
        except Exception:
            os.system("rm -rf %s" % nystroem_data_filename)
            save_dataset(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)

    print(np.shape(nyx))
    print(dir(nyx))

    if not os.path.exists(projected_data_filename):
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        try:
            save_dataset(transformed_data, projected_data_filename)
        except Exception:
            os.system("rm -rf %s" % projected_data_filename)
            save_dataset(transformed_data, projected_data_filename)
    else:
        print("Already performed landmark kernel tICA.")
def test_subsampler_tica():
    n_traj, n_samples, n_features = 1, 500, 4
    lag_time = 2
    X_all_0 = [random.normal(size=(n_samples, n_features))
               for i in range(n_traj)]
    tica_0 = tICA(lag_time=lag_time)
    tica_0.fit(X_all_0)

    subsampler = Subsampler(lag_time=lag_time)
    tica_1 = tICA()
    pipeline = sklearn.pipeline.Pipeline([("subsampler", subsampler),
                                          ("tica", tica_1)])
    pipeline.fit(X_all_0)

    eq(tica_0.n_features, tica_1.n_features)  # Obviously true
    eq(tica_0.n_observations_, tica_1.n_observations_)
    # The eigenvalues should be the same -- NOT the timescales, as tica_1
    # has timescales calculated in a different time unit.
    eq(tica_0.eigenvalues_, tica_1.eigenvalues_)
def fit_and_transform(features_directory, model_dir, stride=5, lag_time=10,
                      n_components=5):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    projected_data_filename = ("%s/phi_psi_chi2_allprot_projected.h5"
                               % model_dir)
    fit_model_filename = ("%s/phi_psi_chi2_allprot_tica_coords.h5"
                          % model_dir)
    #active_pdb_file = "/scratch/users/enf/b2ar_analysis/renamed_topologies/A-00.pdb"

    tica_model = tICA(n_components=n_components, lag_time=lag_time)

    if not os.path.exists(projected_data_filename):
        print("loading feature files")
        feature_files = get_trajectory_files(features_directory, ext=".h5")
        pool = mp.Pool(mp.cpu_count())
        features = pool.map(load_features, feature_files)
        pool.terminate()
        if not os.path.exists(fit_model_filename):
            print("fitting data to tICA model")
            fit_model = tica_model.fit(features)
            verbosedump(fit_model, fit_model_filename)
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
        else:
            print("loading tICA model")
            fit_model = verboseload(fit_model_filename)
            print("transforming")
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
    else:
        fit_model = verboseload(fit_model_filename)
        transformed_data = verboseload(projected_data_filename)

    print(fit_model.summarize())
def test_MetEnkephalin():
    np.random.seed(0)
    data = build_dataset()
    n_features = data[0].shape[1]

    # check whether this recovers a single 1-sparse eigenpair without error
    kstica = KSparseTICA(n_components=1, k=1)
    _ = kstica.fit_transform(data)
    assert np.sum(kstica.components_ != 0) == 1

    ## check whether this recovers >1 eigenpair without error
    #kstica = KSparseTICA(n_components=2)
    #_ = kstica.fit_transform(data)

    ## check whether this recovers all eigenpairs without error
    #kstica = KSparseTICA()
    #_ = kstica.fit_transform(data)

    # check whether we recover the same solution as standard tICA
    # when k = n_features
    n_components = 10
    kstica = KSparseTICA(n_components=n_components, k=n_features)
    tica = tICA(n_components=n_components)
    _ = kstica.fit_transform(data)
    _ = tica.fit_transform(data)
    np.testing.assert_array_almost_equal(kstica.eigenvalues_,
                                         tica.eigenvalues_)
def __init__(self, args):
    from msmbuilder.decomposition import tICA
    if args.lag_time <= 0:
        self.error('offset must be greater than zero')
    self.args = args
    self.model = tICA(n_components=2, lag_time=self.args.lag_time)
    self.labels = [b'tIC1', b'tIC2']
def fit_protein_tica(yaml_file, sparse=False):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("tica__"):
            current_mdl_params[i.split("tica__")[1]] = mdl_params[i]

    if sparse:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(
                glob.glob("./%s/*.jl" % yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except Exception:
                    # skip trajectories that fail to partial_fit
                    pass
            print("Done partial fitting to protein %s" % protein)

    # dumping the tica_mdl
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
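# A hedged usage sketch for fit_protein_tica: the yaml_file layout below is
# inferred from the keys the function reads (mdl_dir, mdl_params,
# protein_list, feature_dir) and the tica__ parameter-prefix convention;
# all paths and values are hypothetical.
yaml_file = {
    "mdl_dir": "models/",
    "feature_dir": "features",
    "protein_list": ["protein_A"],
    "mdl_params": {"tica__n_components": 5, "tica__lag_time": 10},
}
fit_protein_tica(yaml_file, sparse=False)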
def ktica_test(features_dir, tica_dir, landmark_indices=None,
               nystroem_components=1000, tica_components=10, lag_time=5,
               nystroem_data_filename="", fit_model_filename="",
               projected_data_filename=""):
    nys = Nystroem(n_components=nystroem_components)
    tica_model = tICA(n_components=tica_components, lag_time=lag_time)

    feature_files = get_trajectory_files(features_dir, ext=".h5")[0:3]
    #if os.path.exists(nystroem_data_filename):
    #    nyx = verboseload(nystroem_data_filename)
    #else:
    features = load_file_list(feature_files)
    nyx = nys.fit_transform(features)
    verbosedump(nyx, nystroem_data_filename)

    print(np.shape(nyx))
    print(dir(nyx))

    fit_model = tica_model.fit(nyx)
    verbosedump(fit_model, fit_model_filename)
    transformed_data = fit_model.transform(nyx)
    verbosedump(transformed_data, projected_data_filename)
    return
def make_tica_opt(trajectories_t, timeskip_t):
    # frameskip was left undefined by the commented-out block in the
    # original; it is restored here so the function runs.
    shortest_length = len(trajectories_t[0])
    for i in range(len(trajectories_t)):
        if shortest_length > len(trajectories_t[i]):
            shortest_length = len(trajectories_t[i])
    frameskip = int(shortest_length / 2)

    print("Frameskip {:f} ns ({:d} frames) :".format(
        float(frameskip) * timeskip_t / 1000.0, frameskip))
    tica_tot_t = tICA(n_components=len(trajectories_t[0][0]),
                      lag_time=frameskip)
    tica_tot_t.fit(trajectories_t)
    usable_comps_t = get_smallest_tscale(tica_tot_t)
    equil_t, equil_dists_t = in_equil(tica_tot_t, usable_comps_t)
    n_comp_t = find_components(tica_tot_t, usable_comps_t)
    return (tica_tot_t, equil_t, equil_dists_t, n_comp_t, usable_comps_t,
            frameskip)
def test_plot_decomp_grid(self):
    from msmbuilder.decomposition import tICA
    tica = tICA(n_components=2).fit([data])
    ax = plot_decomp_grid(tica, xlim=(0., 1.), ylim=(0., 1.))
    assert isinstance(ax, SubplotBase)
def stepwise_analysis(counts_fns, out_fn_prefix, lag_time):
    # Load data from tables
    seqs_3d_unorm = []
    shell_w = -1
    for counts_fn in counts_fns:
        h = tables.open_file(counts_fn)
        seq = h.root.shell_counts[:]
        shell_w, = h.root.shell_width[:]
        seqs_3d_unorm.append(seq)
        h.close()
    pickle_save(seqs_3d_unorm, "{}.3d.unnorm.pickl".format(out_fn_prefix))

    # Normalize
    seqs_3d_norm = [normalize(fp3d, shell_w) for fp3d in seqs_3d_unorm]
    del seqs_3d_unorm
    pickle_save(seqs_3d_norm, '{}.3d.norm.pickl'.format(out_fn_prefix))

    # Flatten
    seqs_2d_uprune = [reshape(fp3d) for fp3d in seqs_3d_norm]
    del seqs_3d_norm
    pickle_save(seqs_2d_uprune, '{}.2d.uprune.pickl'.format(out_fn_prefix))

    # Prune low variance
    seqs_2d_prune, deleted = prune_all(seqs_2d_uprune)
    del seqs_2d_uprune
    pickle_save(seqs_2d_prune, '{}.2d.prune.pickl'.format(out_fn_prefix))
    pickle_save(deleted, '{}.deleted.pickl'.format(out_fn_prefix))

    # Fit tICA
    tica = tICA(n_components=10, lag_time=lag_time, weighted_transform=True)
    ticax = tica.fit_transform(seqs_2d_prune)
    del seqs_2d_prune
    pickle_save(tica, '{}.tica.pickl'.format(out_fn_prefix))
    pickle_save(ticax, '{}.ticax.pickl'.format(out_fn_prefix))
def decompose_features(features, decomposer, n_components=None, lag_time=1):
    '''
    Decomposing features reduces the dimensionality of the feature space.
    Each component is an eigenvector of the feature space, of dimension
    (n_features,), and the old features are projected onto the new space:
    for one sample vectorized as (n_features,), applying the transform
    matrix of shape (n_components, n_features) yields its projection of
    shape (n_components,).
    ------------------------------------------------------------------------
    Input
    features : array-like, length n_trajs, each of shape
        (n_samples, n_features)

    Output
    features_new : array-like, length n_trajs, each of shape
        (n_samples, n_components)
        ((n_samples, n_samples) if n_components = None)
    dcmp.components_ : shape (n_components, n_features),
        ((n_samples, n_features) if n_components = None)
        PCA : principal axes in feature space, representing the directions
            of maximum variance in the data.
        tICA : components with maximum autocorrelation.
    '''
    if decomposer == 'PCA':
        from msmbuilder.decomposition import PCA
        dcmp = PCA(n_components=n_components)
    elif decomposer == 'tICA':
        from msmbuilder.decomposition import tICA
        dcmp = tICA(n_components=n_components, lag_time=lag_time)
    features_new = dcmp.fit_transform(features)
    return features_new, dcmp.components_
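# A minimal shape-bookkeeping sketch for decompose_features, mirroring the
# docstring above; the two trajectories of 100 samples x 6 features are
# hypothetical.
import numpy as np

trajs = [np.random.randn(100, 6) for _ in range(2)]
features_new, components = decompose_features(trajs, 'tICA', n_components=3,
                                              lag_time=1)
assert features_new[0].shape == (100, 3)  # projection onto 3 tICs
assert components.shape == (3, 6)         # (n_components, n_features)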
def ktica(features, landmarks, projected_data_filename,
          nystroem_data_filename, fit_model_filename, sparse=False,
          shrinkage=0.05, wolf=True, rho=0.01, n_components=10, lag_time=5):
    # n_components and lag_time were undefined in the original body; they
    # are exposed here as parameters (defaults follow
    # landmark_ktica_ticaTraj above).
    if not sparse:
        if shrinkage is None:
            tica_model = tICA(n_components=n_components, lag_time=lag_time)
        else:
            if wolf:
                tica_model = tICA(n_components=n_components,
                                  lag_time=lag_time, shrinkage=shrinkage)
            else:
                tica_model = tICA(n_components=n_components,
                                  lag_time=lag_time, gamma=shrinkage)
    else:
        if shrinkage is None:
            tica_model = SparseTICA(n_components=n_components,
                                    lag_time=lag_time, rho=rho)
        else:
            tica_model = SparseTICA(n_components=n_components,
                                    lag_time=lag_time, rho=rho,
                                    shrinkage=shrinkage)

    if not os.path.exists(nystroem_data_filename):
        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)
        print("Computed Nystroem.")
        del features
        del landmarks
        try:
            save_dataset(nyx, nystroem_data_filename)
        except Exception:
            os.system("rm -rf %s" % nystroem_data_filename)
            save_dataset(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)
        print("Loaded Nystroem")

    if not os.path.exists(projected_data_filename):
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        try:
            save_dataset(transformed_data, projected_data_filename)
        except Exception:
            os.system("rm -rf %s" % projected_data_filename)
            save_dataset(transformed_data, projected_data_filename)
    else:
        print("Already performed landmark kernel tICA.")
def test_score_1():
    X = np.random.randn(100, 5)
    for n in range(1, 5):
        tica = tICA(n_components=n, gamma=0)
        tica.fit([X])
        assert_approx_equal(tica.score([X]), tica.eigenvalues_.sum())
        X2 = np.random.randn(100, 5)
        assert tica.score([X2]) < tica.score([X])
        assert_approx_equal(tica.score([X]), tica.score_)
def test_1():
    data = build_dataset()
    tica = tICA(n_components=1).fit(data)
    tic0 = tica.components_[0]
    print('tICA\n', tic0)

    stica = SparseTICA(n_components=1, verbose=True).fit(data)
    stic0 = stica.components_[0]
    print('Sparse tICA\n', stic0)

    assert np.allclose(stic0, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
def test_doublewell():
    data = build_dataset()
    tica = tICA(n_components=1).fit(data)
    tic0 = tica.components_[0]

    stica = SparseTICA(n_components=1, verbose=False).fit(data)
    stic0 = stica.components_[0]

    np.testing.assert_array_almost_equal(stic0[1:], np.zeros(9))
    np.testing.assert_almost_equal(stic0[0], 0.58, decimal=1)
def test_singular_1():
    tica = tICA(n_components=1)

    # make some data that has one column repeated twice
    X = np.random.randn(100, 2)
    X = np.hstack((X, X[:, 0, np.newaxis]))

    tica.fit([X])
    assert tica.components_.dtype == np.float64
    assert tica.eigenvalues_.dtype == np.float64
def test_singular_2():
    tica = tICA(n_components=1)

    # make some data that has one column of all zeros
    X = np.random.randn(100, 2)
    X = np.hstack((X, np.zeros((100, 1))))

    tica.fit([X])
    assert tica.components_.dtype == np.float64
    assert tica.eigenvalues_.dtype == np.float64
def test_sample_dimension():
    np.random.seed(42)
    X = np.random.randn(500, 5)
    data = [X, X, X]

    tica = tICA(n_components=2, lag_time=1).fit(data)
    tica_trajs = {k: tica.partial_transform(v) for k, v in enumerate(data)}
    res = sample_dimension(tica_trajs, 0, 10, scheme="linear")
    res2 = sample_dimension(tica_trajs, 1, 10, scheme="linear")
    assert len(res) == len(res2) == 10
def get_pipeline(parameters):
    """
    Wrapper so that a new instance of the pipeline can be instantiated for
    every fold.

    :return: sklearn.pipeline.Pipeline object
    """
    pipe = Pipeline([('variance_cut', VarianceThreshold()),
                     ('tica', tICA(kinetic_mapping=True)),
                     ('cluster', MiniBatchKMeans()),
                     ('msm', MarkovStateModel(use_gap='timescales',
                                              lag_time=50, verbose=True))])
    pipe.set_params(**parameters)
    return pipe
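# A hedged usage sketch for get_pipeline: parameters follow sklearn's
# step__param convention, and the particular keys, values, and fold data
# below are illustrative assumptions. Instantiating a fresh pipeline per
# fold keeps fitted state from leaking between folds.
import numpy as np

params = {
    'tica__lag_time': 10,
    'tica__n_components': 3,
    'cluster__n_clusters': 10,
}
# two hypothetical folds, each a list of feature trajectories
folds = [[np.random.randn(1000, 8)], [np.random.randn(1000, 8)]]
for train_trajs in folds:
    pipe = get_pipeline(params)  # new, unfitted instance per fold
    pipe.fit(train_trajs)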
def tica_wrapper(proj_folder, feature_dict, lag_time=10):
    # 100ps * 100 == 10ns, and 10 features
    if os.path.exists(proj_folder + "/tica_features.pkl"):
        return verboseload(proj_folder + "/tica_features.pkl")

    tica_mdl = tICA(lag_time=lag_time, n_components=10)
    tica_mdl.fit([feature_dict[i] for i in feature_dict.keys()])

    tica_features = {}
    for i in feature_dict.keys():
        tica_features[i] = tica_mdl.transform([feature_dict[i]])[0]

    verbosedump(tica_features, proj_folder + "/tica_features.pkl")
    return tica_features
def decompose_features(features, decomposer, n_components=None, lag_time=1):
    '''
    Input
    features : list of arrays, length n_trajs, each of shape
        (n_samples, n_features)

    Output
    features_new : list of arrays, length n_trajs, each of shape
        (n_samples, n_features_new)
    '''
    if decomposer == 'PCA':
        from msmbuilder.decomposition import PCA
        dcmp = PCA(n_components=n_components)
    elif decomposer == 'tICA':
        from msmbuilder.decomposition import tICA
        dcmp = tICA(n_components=n_components, lag_time=lag_time)
    return dcmp.fit_transform(features)
def build_model(self, user_defined_model):
    """
    Load or build a model (Pipeline from scikit-learn) to do all the
    transforming and fitting.

    :param user_defined_model: Either a string (to load from disk) or a
        Pipeline object to use as model
    :return model: Return the model back
    """
    if user_defined_model is None:
        if os.path.exists(self.model_pkl_fname):
            logger.info('Loading model pkl file {}'.format(
                self.model_pkl_fname))
            model = load_generic(self.model_pkl_fname)
        else:
            logger.info('Building default model based on dihedrals')

            # Use a lag time of 1 ns for the tICA and MSM. If the stride
            # is too big and we can't do that, use 1 frame and report how
            # much that is in ns.
            if self.app.meta is not None:
                lag_time = max(1, int(1 / self.timestep))
                logger.info('Using a lag time of {} ns for the tICA and '
                            'MSM'.format(lag_time * self.timestep))
            else:
                self.timestep = None
                lag_time = 1
                logger.warning('Cannot determine timestep. '
                               'Defaulting to 1 frame.')
            model = Pipeline([
                ('feat', DihedralFeaturizer()),
                ('scaler', RobustScaler()),
                ('tICA', tICA(lag_time=lag_time, commute_mapping=True,
                              n_components=10)),
                ('clusterer', MiniBatchKMeans(n_clusters=200)),
                ('msm', MarkovStateModel(lag_time=lag_time,
                                         ergodic_cutoff='off',
                                         reversible_type=None))
            ])
    else:
        if not isinstance(user_defined_model, Pipeline):
            raise ValueError(
                'model is not an sklearn.pipeline.Pipeline object')
        else:
            logger.info('Using user defined model')
            model = user_defined_model
    return model
def generate_tics(self, featurized):
    """
    Generates the tICA-reduced dataset. Tracks the tICA object and
    partially fits it, which speeds this step up a lot by only adding new
    data rather than re-fitting each time.

    Returns: tica'd dataset
    """
    if os.path.isfile(os.path.join(self.dir,
                                   "tICA_%d.h5" % self.generation)):
        ticr = utils.load_tica_h5(
            os.path.join(self.dir, "tICA_%d.h5" % self.generation))
    elif os.path.isfile(os.path.join(self.dir, "tICA.pkl")):  # legacy
        ticr = utils.load(os.path.join(self.dir, "tICA.pkl"))
    else:
        ticr = tICA(n_components=self.config.getint("model", "num_tics"),
                    lag_time=self.config.getint("model", "tica_lag"))

    for newfeat in featurized:
        ticr.partial_fit(newfeat)
    utils.save_tica_h5(ticr,
                       os.path.join(self.dir,
                                    "tICA_%d.h5" % self.generation))

    # Now apply tICA to the whole feature set. We need to do this to all
    # featurized data again since the tICs have changed now that we have
    # updated them with new data. Do one generation at a time to save
    # memory. (self.featurized is expected to contain a format placeholder
    # that the generation number fills in.)
    ticad = []
    for gen in range(1, self.generation):
        if os.path.isfile("%s.h5" % self.featurized % gen):
            feated = utils.load_features_h5("%s.h5" % self.featurized % gen)
        else:
            feated = utils.load("%s.pkl" % self.featurized % gen)
        ticad.extend(ticr.transform(feated))

    # Add the features we have in memory now
    ticad.extend(ticr.transform(featurized))
    utils.save_features_h5(ticad, "ticad_%d.h5" % self.generation)
    return ticad
def setUp(self):
    numpy.random.seed(12)
    self.top = 'data_app/runs/structure.prmtop'
    self.traj_1 = 'data_app/runs/run-000.nc'
    self.traj_2 = 'data_app/runs/run-001.nc'
    self.feat = DihedralFeaturizer()
    self.traj_dict = {
        0: load(self.traj_1, top=self.top),
        1: load(self.traj_2, top=self.top)
    }
    self.scaler = RobustScaler()
    self.tica = tICA(n_components=2)
    self.ftrajs = {
        0: numpy.random.rand(100, 50),
        1: numpy.random.rand(100, 50),
    }
def featurizeData(xyz, tica_dim):
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    if os.path.exists('diheds'):
        os.system('rm -rf diheds')
    diheds = xyz.fit_transform_with(featurizer, 'diheds/', fmt='dir-npy')

    scaler = RobustScaler()
    if os.path.exists('scaled_diheds'):
        os.system('rm -rf scaled_diheds')
    scaled_diheds = diheds.fit_transform_with(scaler, 'scaled_diheds/',
                                              fmt='dir-npy')

    tica_model = tICA(lag_time=1, n_components=tica_dim)
    tica_model = scaled_diheds.fit_with(tica_model)
    if os.path.exists('ticas'):
        os.system('rm -rf ticas')
    tica_trajs = scaled_diheds.transform_with(tica_model, 'ticas/',
                                              fmt='dir-npy')
    return tica_trajs
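# A hedged usage sketch for featurizeData: it expects an msmbuilder dataset
# object (the fit_transform_with / fit_with / transform_with API). The
# trajectory glob and topology path below are hypothetical.
from msmbuilder.dataset import dataset

xyz = dataset('trajs/*.xtc', topology='structure.pdb', stride=10)
tica_trajs = featurizeData(xyz, tica_dim=4)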
def fit_predict_tica_embeddings(traj_folder, reference_tica_information,
                                outputdir, names, n_components,
                                tica_lagtime):
    traj_list_array, pairwise_distance = readtrajs_from_folder(traj_folder)

    # calculate the tica means for the current system
    test_tica = tICA(lag_time=tica_lagtime, n_components=n_components)
    test_tica.fit(pairwise_distance)
    numpy.savetxt('%s/%s_pairwise_means' % (outputdir, names),
                  test_tica.means_)

    for line in range(len(traj_list_array)):
        temp = numpy.load("%s/%s.npy" % (traj_folder,
                                         traj_list_array[line]))
        # project onto the reference tICA eigenvectors
        results_to_store = numpy.dot(
            (temp - test_tica.means_.T),
            reference_tica_information.eigenvectors_[:, :])
        numpy.savetxt(
            "%s/%s_ticproj.txt" % (outputdir, traj_list_array[line]),
            results_to_store[:, 0:n_components])
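# A hedged equivalence sketch: the manual projection above (mean-centering,
# then multiplying by the eigenvectors) should match what msmbuilder's
# tICA.partial_transform computes up to the component slice. The model and
# feature array below are illustrative stand-ins.
import numpy
from msmbuilder.decomposition import tICA

X = numpy.random.randn(500, 8)  # illustrative feature array
tica_model = tICA(lag_time=10, n_components=3).fit([X])
proj_manual = numpy.dot(
    X - tica_model.means_,
    tica_model.eigenvectors_[:, :tica_model.n_components])
proj_api = tica_model.partial_transform(X)
assert numpy.allclose(proj_manual, proj_api)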
def fit_and_transform(directory, stride=5):
    projected_data_filename = ("/scratch/users/enf/b2ar_analysis/"
                               "phi_psi_chi_stride%d_projected.h5" % stride)
    fit_model_filename = ("/scratch/users/enf/b2ar_analysis/"
                          "phi_psi_chi2_stride%s_tica_coords.h5" % stride)
    #active_pdb_file = "/scratch/users/enf/b2ar_analysis/3P0G_pymol_prepped.pdb"
    active_pdb_file = "/scratch/users/enf/b2ar_analysis/system_B.pdb"

    tica_model = tICA(n_components=4)

    if not os.path.exists(projected_data_filename):
        print("loading feature files")
        feature_files = get_trajectory_files(directory)
        pool = mp.Pool(mp.cpu_count())
        features = pool.map(load_features, feature_files)
        pool.terminate()
        if not os.path.exists(fit_model_filename):
            print("fitting data to tICA model")
            fit_model = tica_model.fit(features)
            verbosedump(fit_model, fit_model_filename)
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
        else:
            print("loading tICA model")
            fit_model = verboseload(fit_model_filename)
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
    else:
        fit_model = verboseload(fit_model_filename)
        transformed_data = verboseload(projected_data_filename)

    active_pdb = md.load(active_pdb_file)
    top = active_pdb.topology
    atom_indices = [
        a.index for a in top.atoms
        if a.residue.is_protein
        and a.residue.resSeq != 341
        and a.residue.name[0:2] != "HI"
        and a.residue.resSeq != 79
        and a.residue.resSeq != 296
        and a.residue.resSeq != 269
        and a.residue.resSeq != 178
        and a.residue.resSeq != 93
        and a.residue.name != "NMA"
        and a.residue.name != "NME"
        and a.residue.name != "ACE"
    ]
    active_pdb = md.load(active_pdb_file, atom_indices=atom_indices)
    featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi2'])
    active_pdb_features = featurizer.transform(active_pdb)
    active_pdb_projected = fit_model.transform(active_pdb_features)
    print(active_pdb_projected[0:4])
def test_multiple_components():
    X = np.random.randn(100, 5)
    tica = tICA(n_components=1, gamma=0)
    tica.fit([X])

    Y1 = tica.transform([X])[0]
    tica.n_components = 4
    Y4 = tica.transform([X])[0]
    tica.n_components = 3
    Y3 = tica.transform([X])[0]

    assert Y1.shape == (100, 1)
    assert Y4.shape == (100, 4)
    assert Y3.shape == (100, 3)

    eq(Y1.flatten(), Y3[:, 0])
    eq(Y3, Y4[:, :3])
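# A corollary sketch of test_multiple_components (illustrative, not from the
# original suite): since transform() respects the current value of
# n_components, one fit can serve several projection widths without
# re-fitting.
import numpy as np
from msmbuilder.decomposition import tICA

tica = tICA(n_components=5, gamma=0)
tica.fit([np.random.randn(100, 5)])
for k in (1, 2, 3):
    tica.n_components = k
    Yk = tica.transform([np.random.randn(100, 5)])[0]
    assert Yk.shape[1] == k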
def train_model(args):
    features, feature_type = extract_features(args)

    print("Fitting %s model" % args.model)
    if args.model == "PCA":
        model = PCA(n_components=args.n_components)
        model_type = PCA_MODEL
        projected = model.fit_transform(features)
    elif args.model == "SVD":
        model = TruncatedSVD(n_components=args.n_components)
        model_type = SVD_MODEL
        projected = model.fit_transform(features)
    elif args.model == "ICA":
        model = FastICA(n_components=args.n_components)
        model_type = ICA_MODEL
        projected = model.fit_transform(features)
    elif args.model == "tICA":
        model = tICA(n_components=args.n_components, kinetic_mapping=True,
                     lag_time=args.lag_time)
        model_type = TICA_MODEL
        projected = model.fit_transform([features])[0]
    else:
        raise ValueError("Unknown model type '%s'" % args.model)

    print("Writing model")
    model = {
        LAG_TIME_KEY: args.lag_time,
        MODEL_TYPE_KEY: model_type,
        MODEL_KEY: model,
        PROJECTION_KEY: projected,
        FEATURE_TYPE_KEY: feature_type
    }
    joblib.dump(model, args.model_file)
def fit_and_transform(directory):
    print("fitting data to tICA model")
    tica_model = tICA(n_components=4)
    features = generateData(get_trajectory_files(directory))

    for data in features:
        print(np.shape(data[0]))
        tica_model.partial_fit(data[0])
        print("Fitting: ")
        print(data)

    transformed_data = []
    for data in features:
        print("Transforming: ")
        print(data)
        transformed_data.append(tica_model.partial_transform(data))

    verbosedump(transformed_data,
                "/home/enf/b2ar_analysis/"
                "phi_psi_chi_stride10_projected.h5")

    trajs = np.concatenate(transformed_data)
    plt.hexbin(trajs[:, 0], trajs[:, 1], bins='log', mincnt=1)
    plt.show()
def fit_and_transform(directory):
    print("fitting data to tICA model")
    tica_model = tICA(n_components=4)
    features = generateData(get_trajectory_files(directory))

    for data in features:
        print(np.shape(data[0]))
        tica_model.partial_fit(data[0])
        print("Fitting: ")
        print(data)

    transformed_data = []
    for data in features:
        print("Transforming: ")
        print(data)
        transformed_data.append(tica_model.partial_transform(data))

    verbosedump(transformed_data,
                "/scratch/users/enf/b2ar_analysis/"
                "phi_psi_chi_stride10_projected.h5")

    trajs = np.concatenate(transformed_data)
    plt.hexbin(trajs[:, 0], trajs[:, 1], bins='log', mincnt=1)
    plt.show()
def fit_protein_tica(yaml_file, sparse=False, ksparse=None):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("tica__"):
            current_mdl_params[i.split("tica__")[1]] = mdl_params[i]

    if sparse:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    elif isinstance(ksparse, int):
        current_mdl_params["k"] = ksparse
        protein_tica_mdl = KSparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            if os.path.exists("./normalized_features"):
                featurized_traj = sorted(
                    glob.glob("./normalized_features/*.jl"), key=keynat)
            else:
                print('Warning: features have not been scaled')
                featurized_traj = sorted(
                    glob.glob("./%s/*.jl" % yaml_file["feature_dir"]),
                    key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except Exception:
                    print('Error')
            print("Done partial fitting to protein %s" % protein)

    # dumping the tica_mdl
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
import numpy as np
import msmexplorer as msme
from msmbuilder.example_datasets import FsPeptide
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel

rs = np.random.RandomState(42)

# Load Fs Peptide Data
trajs = FsPeptide().get().trajectories

# Extract Backbone Dihedrals
featurizer = DihedralFeaturizer(types=['chi1'])
diheds = featurizer.fit_transform(trajs)

# Perform Dimensionality Reduction
tica_model = tICA(lag_time=2, n_components=2)
tica_trajs = tica_model.fit_transform(diheds)

# Perform Clustering
clusterer = MiniBatchKMeans(n_clusters=12, random_state=rs)
clustered_trajs = clusterer.fit_transform(tica_trajs)

# Construct MSM
msm = MarkovStateModel(lag_time=2)
assignments = msm.fit_transform(clustered_trajs)

# Plot Stacked Distributions
a = np.concatenate(assignments, axis=0)
d = np.concatenate(diheds, axis=0)

# Plot Stacked Distributions of the sine of each Chi1 angle
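# The original snippet is truncated after the comment above. Its wording
# suggests a stacked-distribution plot follows (the msmexplorer gallery uses
# msme.plot_stackdist for this); since the exact call is not in the source,
# the block below is a plain-matplotlib stand-in, an assumption rather than
# the original code.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
for state in np.unique(a):
    # offset each state's distribution vertically to "stack" them;
    # column 0 of d is the sin component of the first chi1 feature
    counts, edges = np.histogram(d[a == state, 0], bins=50, density=True)
    ax.plot(0.5 * (edges[1:] + edges[:-1]), counts + 2 * state)
ax.set_xlabel(r'$\sin(\chi_1)$')
ax.set_ylabel('MSM state (offset)')
fig.savefig('chi1_stackdist.png')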
def fit_and_transform(features_directory, model_dir, stride=5, lag_time=10,
                      n_components=5, wolf=True, shrinkage=None, rho=0.05,
                      parallel=True, sparse=True, traj_ext=".h5"):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    projected_data_filename = ("%s/phi_psi_chi2_allprot_projected.h5"
                               % model_dir)
    fit_model_filename = ("%s/phi_psi_chi2_allprot_tica_coords.h5"
                          % model_dir)
    #active_pdb_file = "/scratch/users/enf/b2ar_analysis/renamed_topologies/A-00.pdb"

    if not sparse:
        if shrinkage is None:
            tica_model = tICA(n_components=n_components, lag_time=lag_time)
        else:
            tica_model = tICA(n_components=n_components, lag_time=lag_time,
                              shrinkage=shrinkage)
    else:
        if shrinkage is None:
            tica_model = SparseTICA(n_components=n_components,
                                    lag_time=lag_time, rho=rho)
        else:
            tica_model = SparseTICA(n_components=n_components,
                                    lag_time=lag_time, rho=rho,
                                    shrinkage=shrinkage)

    if not os.path.exists(projected_data_filename):
        print("loading feature files")
        feature_files = get_trajectory_files(features_directory,
                                             ext=traj_ext)
        if len(feature_files) == 0:
            feature_files = get_trajectory_files(features_directory,
                                                 ext=".dataset")
        if not parallel:
            features = []
            for feature_file in feature_files:
                print("loading %s" % feature_file)
                features.append(load_features(feature_file))
        else:
            pool = mp.Pool(mp.cpu_count())
            features = pool.map(load_features, feature_files)
            pool.terminate()

        # if any trajectory disagrees on the feature axis, transpose all
        transpose = False
        for i in range(0, len(features)):
            if np.shape(features[0])[1] != np.shape(features[i])[1]:
                transpose = True
                break
        if transpose:
            for i in range(0, len(features)):
                features[i] = np.transpose(features[i])

        print(np.shape(features[0]))
        print(features[0][0][0:10])
        print(np.shape(features))

        print("fitting data to tICA model")
        fit_model = tica_model.fit(features)
        print(fit_model.summarize())
        transformed_data = fit_model.transform(features)
        print("transformed data with tICA model")
        verbosedump(fit_model, fit_model_filename)
        print("saved tICA model")
        verbosedump(transformed_data, projected_data_filename)
        print("saved data projected onto tICA coords")
    else:
        print("already computed tICA model")
# Load trajectories
################################################################################

print('loading trajectories...')
filenames = glob(os.path.join(source_directory, '*0.h5'))
trajectories = [md.load(filename) for filename in filenames]
print("We are analyzing %s trajectories." % len(trajectories))

################################################################################
# initialize dihedral and tICA features
################################################################################

print('initializing dihedral and tICA features...')
dihedrals = featurizer.DihedralFeaturizer(
    types=["chi1"]).transform(trajectories)
# count the feature columns, not the frames
print("We are using %s chi1 dihedral features." % dihedrals[0].shape[1])
tica = decomposition.tICA(n_components=4, lag_time=1600)
X = tica.fit_transform(dihedrals)

################################################################################
# Make eigenvalues plot
################################################################################

plt.clf()
eigenvalues = (tica.eigenvalues_) ** 2
sum_eigenvalues = np.sum(eigenvalues[0:2])
print("This is the sum of the first two eigenvalues: %s." % sum_eigenvalues)
plt.plot(eigenvalues)
plt.xlim(0, 4)
def calculate_tica_components():
    # Calculating tICA components
    print("Calculating tICA components...")
    in_files = glob.glob("out*npy")
    loaded_files = [np.load(filename) for filename in in_files]
    tica = tICA(lag_time=tica_lagtime,
                n_components=int(tica_components)).fit_transform(loaded_files)
    np.save('lag_%d_comp_%d.npy' % (tica_lagtime, tica_components), tica)
    tica_data = 'data_lag_%d_comp_%d' % (tica_lagtime, tica_components)
    joblib.dump(tica, tica_data)
    data = np.load('lag_%d_comp_%d.npy' % (tica_lagtime, tica_components))

    # extract the four tICA components
    for i in range(len(glob.glob('out*npy'))):
        for j in range(len(data[i])):
            tica_1.append(data[i][j][0])
            tica_2.append(data[i][j][1])
            tica_3.append(data[i][j][2])
            tica_4.append(data[i][j][3])

    # Clustering via KCenters or KMeans
    if cluster_method == 'kcenters':
        print("Clustering via KCenters...")
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        print("Clustering via KMeans...")
        clusters = KMeans(n_clusters)
    else:
        sys.exit("Invalid cluster_method. Use kmeans or kcenters.")

    sequences = clusters.fit_transform(tica)
    np.save('lag_%d_clusters_%d_sequences.npy' % (tica_lagtime, n_clusters),
            sequences)
    np.save('lag_%d_clusters_%d_center.npy' % (tica_lagtime, n_clusters),
            clusters.cluster_centers_)
    cluster_data = 'lag_%d_clusters_%d.pkl' % (tica_lagtime, n_clusters)
    joblib.dump(sequences, cluster_data)

    # Determining cluster populations
    print("Determining cluster populations...")
    # how many frames are in each cluster
    counts = np.array([len(np.where(np.concatenate(sequences) == i)[0])
                       for i in range(n_clusters)])
    normalized_counts = counts / float(counts.sum())
    percentages = [i * 100 for i in normalized_counts]

    # Plotting the tICA components with cluster centers
    print("Plotting tICA components with cluster centers...")
    plt.figure(0)  # plotting tica_1, tica_2
    plt.hexbin(tica_1, tica_2, bins='log')  # cmap=cmaps.viridis
    x_centers = [clusters.cluster_centers_[i][0]
                 for i in range(len(clusters.cluster_centers_))]
    y_centers = [clusters.cluster_centers_[i][1]
                 for i in range(len(clusters.cluster_centers_))]
    plt.plot(x_centers, y_centers, 'wo')
    # annotate each center with its percentage population
    for label, x, y in zip(["%.4f" % i for i in percentages],
                           x_centers, y_centers):
        plt.annotate(
            label, xy=(x, y), xytext=(-20, 20),
            textcoords='offset points', ha='right', va='bottom',
            bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
            arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
    plt.savefig('tica_1_2.png')

    plt.figure(1)  # plotting tica_1, tica_3
    plt.hexbin(tica_1, tica_3, bins='log')
    x_centers = [clusters.cluster_centers_[i][0]
                 for i in range(len(clusters.cluster_centers_))]
    y_centers = [clusters.cluster_centers_[i][2]
                 for i in range(len(clusters.cluster_centers_))]
    plt.plot(x_centers, y_centers, 'wo')
    for label, x, y in zip(["%.4f" % i for i in percentages],
                           x_centers, y_centers):
        plt.annotate(
            label, xy=(x, y), xytext=(-20, 20),
            textcoords='offset points', ha='right', va='bottom',
            bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
            arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
    plt.savefig('tica_1_3.png')

    plt.figure(2)  # plotting tica_2, tica_3
    plt.hexbin(tica_2, tica_3, bins='log')
    x_centers = [clusters.cluster_centers_[j][1]
                 for j in range(len(clusters.cluster_centers_))]
    y_centers = [clusters.cluster_centers_[j][2]
                 for j in range(len(clusters.cluster_centers_))]
    plt.plot(x_centers, y_centers, 'wo')
    for label, x, y in zip(["%.4f" % i for i in percentages],
                           x_centers, y_centers):
        plt.annotate(
            label, xy=(x, y), xytext=(-20, 20),
            textcoords='offset points', ha='right', va='bottom',
            bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
            arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
    plt.savefig('tica_2_3.png')

    # Determining cluster entropy (this yields errors for me)
    # print("Determining cluster entropy")
    # cluster_entropy = (-1.0 * normalized_counts *
    #                    np.log(normalized_counts)).sum()
    # np.savetxt('cluster_entropy.dat', cluster_entropy)

    # Determining the cluster populations and writing out PDBs for
    # cluster centers
    print("Determining cluster populations...")
    # how many frames are in each cluster
    counts = np.array([len(np.where(np.concatenate(sequences) == i)[0])
                       for i in range(n_clusters)])
    normalized_counts = counts / float(counts.sum())
    np.savetxt('populations.dat', normalized_counts)

    print("Performing cluster analytics and saving center PDBs...\n")
    for i in range(len(glob.glob("traj*xtc"))):
        n_snapshots = len(clusters.distances_[i])
        # frames that have centers
        cluster_indices = np.arange(n_snapshots)[
            clusters.distances_[i] < 1e-6]
        cluster_labels = sequences[i][cluster_indices]  # cluster numbers
        # print only the trajectories that have cluster centers
        if cluster_indices.size != 0:
            # for each cluster center found in this trajectory
            for j in range(len(cluster_labels)):
                print('Cluster center', cluster_labels[j],
                      'was found in trajectory', str(i) + '.')
                print('It is found on frame', cluster_indices[j],
                      'and has a relative population of',
                      "%.4f" % percentages[cluster_labels[j]], '%.')
            xtcfile = sorted(glob.glob("traj*xtc"))[i]
            for j in range(len(cluster_indices)):  # save the snapshots
                cluster_traj = md.load_frame(xtcfile, cluster_indices[j],
                                             top='structure.gro')
                cluster_traj.save_pdb(
                    'state_%d.pdb' % (cluster_labels[j] + 1))

    # Calculating implied timescales
    print("\nCalculating Implied Timescales...")
    timescales = implied_timescales(sequences, lagtimes,
                                    n_timescales=n_timescales,
                                    msm=MarkovStateModel(verbose=False))
    implied_timescale_data = 'ipt_lag_%d_clusters_%d.pkl' % (tica_lagtime,
                                                             n_clusters)
    joblib.dump(timescales, implied_timescale_data)
    numpy_timescale_data = 'lag_%d_clusters_%d_timescales.npy' % (
        tica_lagtime, n_clusters)
    np.savetxt('lagtimes.txt', lagtimes)
    np.save(numpy_timescale_data, timescales)

    # Plotting IPTs (lagtimes and timescales)
    print("Plotting Implied Timescales...")
    for i in range(n_timescales):
        plt.figure(42)
        plt.plot(lagtimes * time_step, timescales[:, i] * time_step, 'o-')
        plt.yscale('log')
        plt.xlabel('lagtime (ns)')
        plt.ylabel('Implied timescales (ns)')
    plt.savefig('lag_%d_clusters_%d_.png' % (tica_lagtime, n_clusters))
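# calculate_tica_components() reads several module-level names. A hedged
# sketch of plausible definitions follows; the values are illustrative
# assumptions, not from the original script.
import numpy as np

tica_lagtime, tica_components = 10, 4
tica_1, tica_2, tica_3, tica_4 = [], [], [], []
cluster_method, n_clusters = 'kmeans', 100
lagtimes = np.arange(1, 11)  # MSM lag times to scan
n_timescales = 5
time_step = 0.1              # ns per frame (hypothetical)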
def test_shape():
    model = tICA(n_components=3).fit([np.random.randn(100, 10)])
    eq(model.eigenvalues_.shape, (3,))
    eq(model.eigenvectors_.shape, (10, 3))
    eq(model.components_.shape, (3, 10))
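# A hedged corollary of the shapes asserted in test_shape: components_ is
# consistent with being the transpose of eigenvectors_ for a fitted model
# ((3, 10) vs (10, 3) above). Illustrative check, stated as an assumption:
import numpy as np
from msmbuilder.decomposition import tICA

model = tICA(n_components=3).fit([np.random.randn(100, 10)])
assert np.allclose(model.components_, model.eigenvectors_.T)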
def fit_tica(self, lag_time):
    self.tica = tICA(n_components=10, lag_time=lag_time,
                     weighted_transform=True)
    self.tica.fit(self.seqs2d)
    self.ticax = self.tica.transform(self.seqs2d)
"""Reduce dimensionality with tICA msmbuilder autogenerated template version 2 created 2017-05-23T16:38:49.125259 please cite msmbuilder in any publications """ from msmbuilder.io import load_trajs, save_trajs, save_generic from msmbuilder.decomposition import tICA ## Load tica = tICA(n_components=5, lag_time=10, kinetic_mapping=True) meta, ftrajs = load_trajs("ftrajs") ## Fit tica.fit(ftrajs.values()) ## Transform ttrajs = {} for k, v in ftrajs.items(): ttrajs[k] = tica.partial_transform(v) ## Save save_trajs(ttrajs, 'ttrajs', meta) save_generic(tica, 'tica.pickl')
def test_1():
    np.random.seed(42)
    X = np.random.randn(10, 3)
    tica = tICA(n_components=2, lag_time=1)
    y2 = tica.fit_transform([np.copy(X)])[0]
if not os.path.isfile("phi_psi_chi2_features_vd_stride10.h5"):
    print("featurizing")
    phi_psi_chi2 = DihedralFeaturizer(types=['phi', 'psi', 'chi2'])
    features = phi_psi_chi2.transform(traj_list=traj)
    print("finished featurizing")
    verbosedump(features, "phi_psi_chi2_features_vd_stride10.h5")
else:
    print("loading existing features")
    features = verboseload("phi_psi_chi2_features_vd_stride10.h5")
    features = [np.concatenate(features)]

if not os.path.isfile("reduced_phi_psi_chi_stride10.h5"):
    print("Fitting tICA model")
    tica_model = tICA(n_components=4)
    fitted_model = tica_model.fit(features)
    reduced_data = fitted_model.transform(features)
    verbosedump(reduced_data, "reduced_phi_psi_chi_stride10.h5")
    print(tica_model.summarize())
else:
    reduced_data = verboseload("reduced_phi_psi_chi_stride10.h5")

clusterer = KMedoids(n_clusters=9)
clusters = clusterer.fit_transform(reduced_data)[0]

center_locations = []
for i in range(0, len(clusters)):
    print(i)