def fit_protein_tica(yaml_file, sparse=False):
    """Incrementally fit a tICA (or SparseTICA) model across all proteins.

    Parameters
    ----------
    yaml_file : dict
        Parsed project config; must contain "mdl_dir", "mdl_params",
        "protein_list" and "feature_dir".
    sparse : bool
        If True, fit a SparseTICA model instead of plain tICA.

    Side effects
    ------------
    Dumps the fitted model to ``<mdl_dir>/tica_mdl.pkl``.
    """
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Keep only tICA hyperparameters, stripping the "tica__" prefix
    # (e.g. "tica__lag_time" -> "lag_time").
    current_mdl_params = {
        key.split("tica__")[1]: value
        for key, value in mdl_params.items()
        if key.startswith("tica__")
    }

    if sparse:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except Exception as exc:
                    # Best-effort fit: skip trajectories that fail, but report
                    # which file failed and why instead of a bare `except: pass`
                    # that would also swallow KeyboardInterrupt/SystemExit.
                    print("Skipping %s: %s" % (f, exc))
            print("Done partial fitting to protein %s" % protein)
    # Persist the fitted model so downstream steps can transform with it.
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
def fit_protein_tica(yaml_file, sparse=False, ksparse=None):
    """Incrementally fit a tICA / SparseTICA / KSparseTICA model.

    Parameters
    ----------
    yaml_file : dict
        Parsed project config; must contain "mdl_dir", "mdl_params",
        "protein_list" and "feature_dir".
    sparse : bool
        If True, fit a SparseTICA model. Takes precedence over ``ksparse``.
    ksparse : int or None
        If an int (and ``sparse`` is False), fit a KSparseTICA model with
        sparsity parameter ``k=ksparse``.

    Side effects
    ------------
    Dumps the fitted model to ``<mdl_dir>/tica_mdl.pkl``.
    """
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Keep only tICA hyperparameters, stripping the "tica__" prefix
    # (e.g. "tica__lag_time" -> "lag_time").
    current_mdl_params = {
        key.split("tica__")[1]: value
        for key, value in mdl_params.items()
        if key.startswith("tica__")
    }

    if sparse:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    elif isinstance(ksparse, int):
        current_mdl_params["k"] = ksparse
        protein_tica_mdl = KSparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            # Prefer pre-scaled features when available; fall back to the raw
            # feature directory with a warning.
            if os.path.exists("./normalized_features"):
                featurized_traj = sorted(glob.glob("./normalized_features/*.jl"), key=keynat)
            else:
                print('Warning: features have not been scaled')
                featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)

            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except Exception as exc:
                    # Best-effort fit: report which file failed and why rather
                    # than the original uninformative bare-except "Error".
                    print("Error fitting %s: %s" % (f, exc))
            print("Done partial fitting to protein %s" % protein)
    # Persist the fitted model so downstream steps can transform with it.
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
# 示例 #3 (Example #3) — stray listing text from the original source, kept as a comment
# 0
def fit_and_transform(features_directory,
                      model_dir,
                      stride=5,
                      lag_time=10,
                      n_components=5,
                      wolf=True,
                      shrinkage=None,
                      rho=0.05,
                      parallel=True,
                      sparse=True,
                      traj_ext=".h5",
                      normalize=True,
                      partial_fit=True,
                      subsample=1,
                      recompute_tica=False,
                      features=None):
    """Fit a tICA (or SparseTICA) model to featurized trajectories and save
    both the fitted model and the projected data under ``model_dir``.

    Parameters
    ----------
    features_directory : str
        Directory containing ".dataset" feature files and a fitted
        "normalizer.h5" preprocessing model.
    model_dir : str
        Output directory for the fitted model and projected coordinates.
    stride, wolf, traj_ext :
        Currently unused; kept for backward-compatible call signatures.
    lag_time, n_components, shrinkage, rho :
        Hyperparameters forwarded to tICA / SparseTICA.
    parallel : bool
        When batch-fitting with no preloaded ``features``, load feature
        files with a multiprocessing pool instead of serially.
    sparse : bool
        Use SparseTICA (with ``rho``) instead of plain tICA.
    normalize : bool
        Apply the normalizer before batch fitting.
        NOTE(review): ignored on the ``partial_fit`` path, which always
        normalizes — confirm this asymmetry is intended.
    partial_fit : bool
        Fit incrementally one trajectory at a time instead of in one batch.
    subsample : int
        Row stride applied when loading features for batch fitting.
    recompute_tica : bool
        Recompute even if the projected-data file already exists.
    features : list of arrays or None
        Pre-loaded feature arrays; skips loading from disk when given.
    """
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" % model_dir
    fit_model_filename = "%s/phi_psi_chi2_allprot_tica_coords.h5" % model_dir
    normalizer = "%s/normalizer.h5" % features_directory
    n = compat_verboseload(normalizer)

    # Build the estimator; shrinkage is only passed when explicitly set so
    # the library default applies otherwise.
    if not sparse:
        if shrinkage is None:
            tica_model = tICA(n_components=n_components, lag_time=lag_time)
        else:
            tica_model = tICA(n_components=n_components,
                              lag_time=lag_time,
                              shrinkage=shrinkage)
    else:
        if shrinkage is None:
            tica_model = SparseTICA(n_components=n_components,
                                    lag_time=lag_time,
                                    rho=rho)
        else:
            tica_model = SparseTICA(n_components=n_components,
                                    lag_time=lag_time,
                                    rho=rho,
                                    shrinkage=shrinkage)

    if not os.path.exists(projected_data_filename) or recompute_tica:
        print("loading feature files")
        feature_files = get_trajectory_files(features_directory,
                                             ext=".dataset")

        if partial_fit:
            transformed_data = []

            # Pass 1: incrementally fit on each normalized trajectory.
            for i, feature_file in enumerate(feature_files):
                print("fitting tICA model to %s" % feature_file)
                if features is None:
                    featurized_traj = load_file(feature_file)
                else:
                    featurized_traj = features[i]
                tica_model.partial_fit(n.transform(featurized_traj))

            print("Finished computing tICA model. Now transforming.")

            # Pass 2: project each trajectory with the fitted model.
            for i, feature_file in enumerate(feature_files):
                print("Transforming %s" % feature_file)
                if features is None:
                    featurized_traj = load_file(feature_file)
                else:
                    featurized_traj = features[i]
                # BUG FIX: the original normalized each trajectory twice here
                # (once into a discarded local, once inline); normalize once
                # and reuse the result.
                normalized_featurized_traj = n.transform(featurized_traj)
                transformed_data.append(
                    tica_model.partial_transform(normalized_featurized_traj))

            fit_model = tica_model

        else:
            if features is None:
                if not parallel:
                    features = []
                    for feature_file in feature_files:
                        print("loading %s" % feature_file)
                        features.append(
                            load_file(feature_file)[::subsample, :])
                else:
                    pool = mp.Pool(mp.cpu_count())
                    features = pool.map(load_file, feature_files)
                    pool.terminate()

            # If any trajectory disagrees with the first on its feature
            # dimension, assume the arrays were saved transposed and flip all.
            transpose = any(
                np.shape(features[0])[1] != np.shape(f)[1] for f in features)
            if transpose:
                features = [np.transpose(f) for f in features]
            print(np.shape(features[0]))
            print((features[0][0][0:10]))
            print((np.shape(features)))

            if normalize:
                features = [n.transform(f) for f in features]

            print("fitting data to tICA model")
            fit_model = tica_model.fit(features)

            if subsample == 1:
                transformed_data = fit_model.transform(features)
            else:
                # Features were subsampled for fitting; reload at full
                # resolution for the projection.
                transformed_data = [
                    fit_model.transform(n.transform(load_file(f)))
                    for f in feature_files
                ]
            print("transformed data with tICA model")

        print((fit_model.summarize()))

        verbosedump(fit_model, fit_model_filename)
        print("saved tICA model")
        verbosedump(transformed_data, projected_data_filename)
        print("saved data projected onto tICA coords")

    else:
        print("already computed tICA model")