import glob
import multiprocessing as mp
import os

import numpy as np

from msmbuilder.decomposition import tICA, SparseTICA, KSparseTICA
from msmbuilder.utils import verbosedump, verboseload

# enter_protein_data_dir, keynat, compat_verboseload, get_trajectory_files,
# and load_file are assumed to be this repo's own helper functions.


def fit_protein_tica(yaml_file, sparse=False, ksparse=None):
    """Fit one tICA model across every protein in the project, streaming the
    featurized trajectories through partial_fit so they never have to be held
    in memory at once. Set sparse=True for SparseTICA, or pass an integer
    ksparse for KSparseTICA with at most k non-zero loadings per tIC."""
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Pull out the tICA hyperparameters ("tica__<param>" keys) and strip the
    # prefix so they can be passed straight to the estimator.
    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("tica__"):
            current_mdl_params[i.split("tica__")[1]] = mdl_params[i]

    if sparse:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    elif isinstance(ksparse, int):
        current_mdl_params["k"] = ksparse
        protein_tica_mdl = KSparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            # Prefer pre-scaled features when they exist; otherwise fall back
            # to the raw feature directory and warn.
            if os.path.exists("./normalized_features"):
                feature_files = sorted(
                    glob.glob("./normalized_features/*.jl"), key=keynat)
            else:
                print("Warning: features have not been scaled")
                feature_files = sorted(
                    glob.glob("./%s/*.jl" % yaml_file["feature_dir"]),
                    key=keynat)
            for f in feature_files:
                featurized_traj = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_traj)
                except Exception as e:
                    # Trajectories shorter than the lag time (or with the
                    # wrong feature dimension) fail; skip them, but say so.
                    print("Could not partial_fit on %s: %s" % (f, e))
            print("Done partial fitting to protein %s" % protein)

    # Dump the fitted tICA model.
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
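# Minimal usage sketch for fit_protein_tica. All paths and parameter values
# below are hypothetical; the dict mirrors the project yaml layout this repo
# expects, with "tica__"-prefixed keys under mdl_params selecting the tICA
# estimator's arguments.
#
#   yaml_file = {
#       "mdl_dir": "./mdl_dir",
#       "feature_dir": "features",
#       "protein_list": ["protein_a", "protein_b"],
#       "mdl_params": {"tica__n_components": 5, "tica__lag_time": 10},
#   }
#   fit_protein_tica(yaml_file, ksparse=10)  # k-sparse tICA with k = 10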
def fit_and_transform(features_directory, model_dir, stride=5, lag_time=10,
                      n_components=5, wolf=True, shrinkage=None, rho=0.05,
                      parallel=True, sparse=True, traj_ext=".h5",
                      normalize=True, partial_fit=True, subsample=1,
                      recompute_tica=False, features=None):
    """Fit a tICA (or SparseTICA) model to the features in features_directory
    and save both the model and the projected trajectories under model_dir.
    (stride, wolf, and traj_ext are accepted for compatibility but unused.)"""
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" % model_dir
    fit_model_filename = "%s/phi_psi_chi2_allprot_tica_coords.h5" % model_dir

    # Load the feature normalizer saved alongside the features.
    normalizer = "%s/normalizer.h5" % features_directory
    n = compat_verboseload(normalizer)

    if not sparse:
        if shrinkage is None:
            tica_model = tICA(n_components=n_components, lag_time=lag_time)
        else:
            tica_model = tICA(n_components=n_components, lag_time=lag_time,
                              shrinkage=shrinkage)
    else:
        if shrinkage is None:
            tica_model = SparseTICA(n_components=n_components,
                                    lag_time=lag_time, rho=rho)
        else:
            tica_model = SparseTICA(n_components=n_components,
                                    lag_time=lag_time, rho=rho,
                                    shrinkage=shrinkage)

    if os.path.exists(projected_data_filename) and not recompute_tica:
        print("already computed tICA model")
        return

    print("loading feature files")
    feature_files = get_trajectory_files(features_directory, ext=".dataset")

    if partial_fit:
        # Two passes over the data: accumulate tICA statistics one trajectory
        # at a time, then transform each trajectory with the fitted model.
        transformed_data = []
        for i, feature_file in enumerate(feature_files):
            print("fitting tICA model to %s" % feature_file)
            if features is None:
                featurized_traj = load_file(feature_file)
            else:
                featurized_traj = features[i]
            tica_model.partial_fit(n.transform(featurized_traj))

        print("Finished computing tICA model. Now transforming.")
        for i, feature_file in enumerate(feature_files):
            print("Transforming %s" % feature_file)
            if features is None:
                featurized_traj = load_file(feature_file)
            else:
                featurized_traj = features[i]
            transformed_data.append(
                tica_model.partial_transform(n.transform(featurized_traj)))
        fit_model = tica_model
    else:
        if features is None:
            if not parallel:
                features = []
                for feature_file in feature_files:
                    print("loading %s" % feature_file)
                    features.append(load_file(feature_file)[::subsample, :])
            else:
                pool = mp.Pool(mp.cpu_count())
                features = pool.map(load_file, feature_files)
                pool.terminate()

        # If any trajectory disagrees with the first on the feature axis,
        # assume the arrays were saved transposed and flip them all.
        transpose = False
        for i in range(len(features)):
            if np.shape(features[0])[1] != np.shape(features[i])[1]:
                transpose = True
                break
        if transpose:
            for i in range(len(features)):
                features[i] = np.transpose(features[i])
        print(np.shape(features[0]))
        print(features[0][0][0:10])
        print(np.shape(features))

        if normalize:
            features = [n.transform(f) for f in features]

        print("fitting data to tICA model")
        fit_model = tica_model.fit(features)
        if subsample == 1:
            transformed_data = fit_model.transform(features)
        else:
            # The model was fit on subsampled frames; re-load and project the
            # full trajectories so the output keeps every frame.
            transformed_data = [
                fit_model.transform(n.transform(load_file(f)))
                for f in feature_files
            ]
        print("transformed data with tICA model")
        print(fit_model.summarize())

    verbosedump(fit_model, fit_model_filename)
    print("saved tICA model")
    verbosedump(transformed_data, projected_data_filename)
    print("saved data projected onto tICA coords")
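# Minimal usage sketch for fit_and_transform. Directory names here are
# hypothetical; features_directory is assumed to hold the ".dataset" feature
# files plus the "normalizer.h5" written by the featurization step, and the
# fitted model and projected coordinates are saved under model_dir.
#
#   fit_and_transform("./features", "./tica_model", lag_time=10,
#                     n_components=5, sparse=False, partial_fit=True)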