def _build_tica_model(n_components, lag_time, sparse, rho, shrinkage):
    """Construct the unfitted estimator selected by ``sparse``.

    ``shrinkage`` is forwarded only when it is not None, so the estimator's
    own default applies otherwise. ``rho`` is passed to ``SparseTICA`` only.
    """
    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if shrinkage is not None:
        model_kwargs["shrinkage"] = shrinkage
    if sparse:
        return SparseTICA(rho=rho, **model_kwargs)
    return tICA(**model_kwargs)


def _load_feature_arrays(feature_files, parallel, subsample):
    """Load every feature file, serially or through a process pool."""
    if not parallel:
        loaded = []
        for feature_file in feature_files:
            print("loading %s" % feature_file)
            loaded.append(load_file(feature_file)[::subsample, :])
        return loaded
    # NOTE(review): `subsample` is not applied on this path (unchanged from
    # the original code) -- confirm whether that is intended.
    # The context manager terminates the workers even if a load raises.
    with mp.Pool(mp.cpu_count()) as pool:
        return pool.map(load_file, feature_files)


def _align_feature_orientation(features):
    """Return ``features`` with every array transposed if widths disagree.

    A new list is returned when transposing, so a list supplied by the
    caller is never modified in place.
    """
    widths_differ = any(
        np.shape(features[0])[1] != np.shape(f)[1] for f in features
    )
    if widths_differ:
        return [np.transpose(f) for f in features]
    return features


def fit_and_transform(features_directory, model_dir, stride=5, lag_time=10, n_components=5, wolf=True, shrinkage=None, rho=0.05, parallel=True, sparse=True, traj_ext=".h5", normalize=True, partial_fit=True, subsample=1, recompute_tica=False, features=None):
    """Fit a tICA model to featurized trajectories and save the projection.

    Two files are written into ``model_dir``: the fitted model
    (``phi_psi_chi2_allprot_tica_coords.h5``) and the projected data
    (``phi_psi_chi2_allprot_projected.h5``). If the projected-data file
    already exists and ``recompute_tica`` is false, nothing is loaded or
    computed and the function only reports that the model is present.

    Args:
        features_directory: Directory searched for ``.dataset`` feature
            files (via ``get_trajectory_files``) and expected to contain
            ``normalizer.h5``.
        model_dir: Output directory; created if it does not exist.
        stride: Accepted for interface compatibility; not used here.
        lag_time: Forwarded to the estimator.
        n_components: Forwarded to the estimator.
        wolf: Accepted for interface compatibility; not used here.
        shrinkage: Forwarded to the estimator only when not None.
        rho: Forwarded to ``SparseTICA`` (ignored when ``sparse`` is false).
        parallel: Load feature files with a process pool. Only consulted
            when ``partial_fit`` is false and ``features`` is None.
        sparse: Use ``SparseTICA`` when true, ``tICA`` otherwise.
        traj_ext: Accepted for interface compatibility; not used here.
        normalize: Apply the loaded normalizer before a full (non-partial)
            fit. See the review notes in the body: the partial-fit path
            always normalizes.
        partial_fit: Fit one trajectory at a time with ``partial_fit`` and
            project with ``partial_transform`` instead of loading all data.
        subsample: Row step applied when files are loaded serially for a
            full fit. When it is not 1, the projection is computed from
            freshly reloaded files rather than from the fitted arrays.
        recompute_tica: Recompute even if the projected-data file exists.
        features: Optional preloaded trajectories used instead of reading
            the files; presumably ordered like the discovered feature
            files -- verify against the caller.

    Returns:
        None. Results are written to disk.
    """
    os.makedirs(model_dir, exist_ok=True)

    projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" % model_dir
    fit_model_filename = "%s/phi_psi_chi2_allprot_tica_coords.h5" % model_dir

    # Guard clause: skip loading the normalizer and building the estimator
    # when the cached projection makes them unnecessary.
    if os.path.exists(projected_data_filename) and not recompute_tica:
        print("already computed tICA model")
        return

    n = compat_verboseload("%s/normalizer.h5" % features_directory)
    tica_model = _build_tica_model(n_components, lag_time, sparse, rho, shrinkage)

    print("loading feature files")
    feature_files = get_trajectory_files(features_directory, ext=".dataset")

    if partial_fit:
        # NOTE(review): this path normalizes unconditionally; the
        # `normalize` flag is not consulted (unchanged from the original).
        # First pass: accumulate statistics one trajectory at a time.
        for i, feature_file in enumerate(feature_files):
            print("fitting tICA model to %s" % feature_file)
            featurized_traj = load_file(feature_file) if features is None else features[i]
            tica_model.partial_fit(n.transform(featurized_traj))

        print("Finished computing tICA model. Now transforming.")

        # Second pass: project each trajectory, normalizing it exactly once.
        transformed_data = []
        for i, feature_file in enumerate(feature_files):
            print("Transforming %s" % feature_file)
            featurized_traj = load_file(feature_file) if features is None else features[i]
            transformed_data.append(
                tica_model.partial_transform(n.transform(featurized_traj)))
        fit_model = tica_model
    else:
        if features is None:
            features = _load_feature_arrays(feature_files, parallel, subsample)
        features = _align_feature_orientation(features)

        print(np.shape(features[0]))
        print((features[0][0][0:10]))
        # Report the trajectory count instead of np.shape(features): the
        # latter builds an array from the whole list, which fails for
        # trajectories of unequal length on recent NumPy.
        print("%d feature trajectories" % len(features))

        if normalize:
            features = [n.transform(f) for f in features]

        print("fitting data to tICA model")
        fit_model = tica_model.fit(features)

        if subsample == 1:
            transformed_data = fit_model.transform(features)
        else:
            # NOTE(review): reloads the full files and normalizes them
            # regardless of `normalize` (unchanged from the original).
            transformed_data = [
                fit_model.transform(n.transform(load_file(f)))
                for f in feature_files
            ]

    print("transformed data with tICA model")
    print((fit_model.summarize()))
    verbosedump(fit_model, fit_model_filename)
    print("saved tICA model")
    verbosedump(transformed_data, projected_data_filename)
    print("saved data projected onto tICA coords")