예제 #1
0
def _test_tic_dict(prj):
    """Cross-check ``Protein.tic_data`` against a brute-force recomputation.

    A random tic index and a random state are drawn for the "kinase_1"
    protein.  Every frame assigned to that state is looked up directly in
    ``tica_data`` and the collected values are compared with what
    ``tic_data(rnd_tic)[rnd_state]`` reports.

    :param prj: The protein series handed to the ``Protein`` constructor
    :return: True when both assertions pass
    """
    protein = Protein(prj, "kinase_1")
    rnd_tic = np.random.randint(protein.n_tics_)
    rnd_state = np.random.randint(protein.n_states_)

    # Walk every trajectory's assignments and keep the chosen tic value of
    # each frame that sits in the randomly chosen state.
    expected = [
        protein.tica_data[traj_name][frame_index][rnd_tic]
        for traj_name in protein.fixed_assignments.keys()
        for frame_index, state in enumerate(protein.fixed_assignments[traj_name])
        if state == rnd_state
    ]

    assert len(expected) == len(protein.tic_data(rnd_tic)[rnd_state])
    assert expected == protein.tic_data(rnd_tic)[rnd_state]
    return True
예제 #2
0
def _test_tic_sampling(yaml_file, protein_name, tic_list, n_frames, scheme):
    """Check that frames sampled along tics 0 and 1 are ordered and in range.

    Runs ``sample_for_all_proteins`` for the single protein, then loads
    ``tic0.xtc`` and ``tic1.xtc`` from the protein's model directory,
    projects them with the protein's tICA model and asserts that

    * the first frame's (rounded) tic value is not above the last frame's
    * every frame lies within the rounded min/max of the protein's tICA data

    :param yaml_file: Loaded project yaml (indexed with ``"mdl_dir"``)
    :param protein_name: Name of the protein to sample
    :param tic_list: Tics forwarded to ``sample_for_all_proteins``
    :param n_frames: Number of frames forwarded to the sampler
    :param scheme: Sampling scheme forwarded to the sampler
    :return: True when all assertions pass
    """
    # NOTE(review): the loop below inspects tics 0 and 1 regardless of
    # ``tic_list`` -- confirm that callers always sample at least those two.
    sample_for_all_proteins(yaml_file, [protein_name],
                            tic_list, n_frames, scheme=scheme)
    series = ProteinSeries(yaml_file)
    protein = Protein(series, protein_name)

    for tic_index in (0, 1):
        protein_mdl_dir = os.path.join(base_dir, yaml_file["mdl_dir"],
                                       protein_name)
        xtc_path = os.path.join(protein_mdl_dir, "tic%d.xtc" % tic_index)
        pdb_path = os.path.join(protein_mdl_dir, "prot.pdb")
        sampled_traj = mdt.load(xtc_path, top=pdb_path)
        print(sampled_traj.n_frames)

        featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
        features = featurizer.partial_transform(sampled_traj)
        t_f = np.round(protein.tica_mdl.transform([features]))
        projected = t_f[0]

        # The sampled trajectory should run from low to high tic values.
        print("Look here", projected)
        assert projected[0][tic_index] <= projected[-1][tic_index]

        # Pool this tic's values from every trajectory; sorted because all
        # three sampling schemes work from the sorted values.
        pooled = [value
                  for traj_tica_data in protein.tica_data.values()
                  for value in traj_tica_data[:, tic_index]]
        pooled = np.round(np.sort(pooled))
        lowest, highest = pooled[0], pooled[-1]

        print(tic_index)
        print(projected[:, tic_index] >= lowest)
        print(projected[:, tic_index] <= highest)

        # Every sampled frame must fall inside the observed limits.
        assert (projected[:, tic_index] >= lowest).all()
        assert (projected[:, tic_index] <= highest).all()
    return True
예제 #3
0
def test_map_tic_component():
    """Verify ``_map_tic_component`` output shapes and spot check residue 0.

    Fits the pipeline for the project under ``base_dir/mdl_dir``, loads the
    feature descriptor and one trajectory of "kinase_1", and maps the first
    tICA component with ``_map_tic_component``.  The first row of the atom
    result must have one entry per atom and the first row of the residue
    result one entry per residue.  The importance of residue 0 is then
    recomputed by hand from the feature descriptor.
    """
    yaml_file = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(yaml_file)
    fit_pipeline(yaml_file["base_dir"])

    with enter_protein_data_dir(yaml_file, "kinase_1"):
        df = pd.DataFrame(verboseload(
            os.path.join(yaml_file["feature_dir"],
                         "feature_descriptor.h5")
        ))
        trj = mdt.load(os.path.join(yaml_file["protein_dir"],
                                    "fake_proj1_0_0.hdf5"))

    ser = ProteinSeries(yaml_file, base_dir)
    prt = Protein(ser, "kinase_1")

    tic_index = 0
    t_c = prt.tica_mdl.components_[tic_index, :]

    a_i, r_i = _map_tic_component(t_c, df, trj)

    assert len(a_i[0]) == trj.n_atoms
    assert len(r_i[0]) == trj.n_residues

    # Spot check residue 0: keep the descriptor rows whose "resids" include
    # residue 0; their index labels select the matching component weights.
    residue0_rows = pd.DataFrame(
        [row for _, row in df.iterrows() if 0 in row["resids"]])
    r0_imp = np.sum(abs(t_c[residue0_rows.index]))

    # Compare with a tolerance: both sides are floating point sums and the
    # summation order inside _map_tic_component need not match this one.
    assert np.isclose(r0_imp, r_i[0, 0])
예제 #4
0
def sample_tic_region(yaml_file,
                      protein_name,
                      tic_region,
                      n_frames=50,
                      fname=None,
                      save_trj=True):
    """
    Helper function for sampling frames around a region in tic space.
    :param yaml_file: The project's yaml file
    :param protein_name: The name of the protein
    :param tic_region(dict): The tic region. Can be multidimensional with
    1 number per tic coordinate (defaults to 0 for all non-mentioned regions)
    :param n_frames: The number of frames around the coordinate
    :param fname: Output file name handed to the frame loader; falls back
    to "sampled_tic_region.xtc" when None
    :param save_trj: Forwarded unchanged to the frame loader
    :return: Whatever ``_frame_loader`` returns for the selected frames
    """
    yaml_file = load_yaml_file(yaml_file)
    protein = Protein(ProteinSeries(yaml_file), protein_name)

    # Freeze the trajectory order so the sampled indices can be mapped back
    # to trajectory names by the frame loader.
    traj_names = list(protein.tica_data.keys())
    tica_per_traj = [protein.tica_data[name] for name in traj_names]
    frame_indices = sample_region(tica_per_traj, tic_region, n_frames)

    output_name = "sampled_tic_region.xtc" if fname is None else fname
    return _frame_loader(yaml_file, protein, traj_names, frame_indices,
                         save_trj, output_name)
예제 #5
0
def test_msm_pull_centroid():
    """Sample two frames per state centroid for "kinase_1" and check the output.

    The returned trajectory must hold two frames for every state, and
    ``centroids.xtc`` must exist in the protein's model directory.
    """
    project_yaml = os.path.join(base_dir, "mdl_dir", "project.yaml")
    protein = Protein(ProteinSeries(project_yaml, base_dir), "kinase_1")

    centroid_traj = sample_state_centroid(project_yaml,
                                          protein.name,
                                          states='all',
                                          n_frames=2,
                                          output_name="centroids.xtc")

    assert centroid_traj.n_frames == protein.n_states_ * 2

    expected_file = os.path.join(base_dir, "mdl_dir", "kinase_1",
                                 "centroids.xtc")
    assert os.path.isfile(expected_file)
예제 #6
0
def test_msm_traj():
    """Sample a two-step MSM trajectory for "kinase_2" and validate it.

    After calling ``sample_msm_traj`` the test reads ``msm_traj.pkl`` and
    ``msm_traj.xtc`` from the protein's model directory, checks that both
    contain ``n_steps`` entries, and checks that re-assigning the frames with
    ``_fit_transform`` reproduces the stored state sequence.
    """
    project_yaml = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_yaml)
    n_steps = 2

    protein = Protein(ProteinSeries(yaml_file, base_dir), "kinase_2")
    first_state = protein.msm.state_labels_[0]
    sample_msm_traj(yaml_file, "kinase_2",
                    n_steps=n_steps,
                    starting_state=first_state)

    with enter_protein_mdl_dir(yaml_file, "kinase_2"):
        stored_steps = verboseload("msm_traj.pkl")
        sampled_traj = mdt.load("msm_traj.xtc", top="prot.pdb")

        assert sampled_traj.n_frames == n_steps
        assert len(stored_steps) == n_steps

        # The frames written out must map back onto the states that were
        # recorded for them.
        reassigned = _fit_transform(protein, sampled_traj)
        assert (reassigned == stored_steps).all()
예제 #7
0
def sample_one_tic(yaml_file,
                   protein_name,
                   tic_index,
                   n_frames,
                   scheme="linear"):
    """
    Sample frames along a single tic for one protein.
    :param yaml_file: The project's yaml file
    :param protein_name: The name of the protein
    :param tic_index: Tic index to sample along
    :param n_frames: The number of frames wanted
    :param scheme: Sampling scheme forwarded to ``pull_frames``
    :return: The result of ``pull_frames``; per the original notes this
    dumps a tic%d.xtc and tic%d.log for the protein inside its model
    directory.
    """
    loaded_yaml = load_yaml_file(yaml_file)
    protein = Protein(ProteinSeries(loaded_yaml), protein_name)
    return pull_frames(loaded_yaml, protein, tic_index, n_frames, scheme)
예제 #8
0
def _load_protein_matrices(yaml_file, protein_name):
    """
    Helper routine to load the assignment and tic matrices for a protein.
    :param yaml_file: yaml file to work with
    :param protein_name: name of the protein
    :return: a 6-tuple of
     prj: the protein series
     prt: the protein project
     key_mapping: mapping of the assignment matrix 0-axis to traj names
     assignment_matrix: matrix built from the protein's fixed assignments
     tics_mapping: mapping of the tics_array 0-axis to traj names
     tics_array: multi-dimensional array; per the original notes the 0th
     axis is the number of trajectories, the 1st axis the length of the
     largest trajectory and the last axis the number of tics in the model
    """
    series = ProteinSeries(yaml_file)
    protein = Protein(series, protein_name)

    # Both helpers return a (mapping, array) pair.
    assignment_keys, assignment_matrix = create_assignment_matrix(
        protein.fixed_assignments)
    tics_keys, tics_array = create_tics_array(
        protein.fixed_assignments, protein.kmeans_mdl, protein.tica_data)

    return (series, protein,
            assignment_keys, assignment_matrix,
            tics_keys, tics_array)
예제 #9
0
def test_plotting_utils():
    """End-to-end check of the tic boundary, histogram and free energy helpers.

    Inside a temporary directory the test creates fake data for two proteins,
    sets up and fits the series analysis, and then verifies

    * ``global_tic_boundaries`` spans the global min/max of each tic with
      ``n_bins`` points,
    * ``tica_histogram`` agrees with a direct ``np.histogram`` call,
    * ``one_dim_tic_free_energy`` labels its rows with the protein name and
      the "mle" model index.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {
            "kinase_1": ["fake_proj1"],
            "kinase_2": ["fake_proj2"],
        }
        # NOTE(review): 'bootrap__n_samples' looks like a misspelling of
        # 'bootstrap__n_samples'; it is left untouched because the key is
        # consumed outside this block -- confirm which spelling is expected.
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__kinetic_mapping': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            'bootrap__n_samples': 1,
        }

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(base_dir, mdl_dir, feature_dir, series_name,
                              protein_list, project_dict, mdl_params)

        fit_pipeline(base_dir)
        prj = ProteinSeries(os.path.join(mdl_dir, "project.yaml"))

        prt1 = Protein(prj, "kinase_1")
        prt2 = Protein(prj, "kinase_2")

        # presumably the padding factor applied around the tic limits; it is
        # zeroed so the boundaries can be compared exactly -- TODO confirm
        prt1._mlpt_fct = 0.0
        prt2._mlpt_fct = 0.0
        n_bins = 100

        lin_spaced_tic_dict = global_tic_boundaries([prt1, prt2],
                                                    range(prt1.n_tics_),
                                                    n_bins)

        def test_bounds():
            """The boundaries of each tic run from its global min to max."""
            for tic in range(prt1.n_tics_):
                # Select this tic's column in every trajectory of both
                # proteins so the check stays valid with several tics.
                columns = [traj_data[:, tic]
                           for prt in (prt1, prt2)
                           for traj_data in prt.tica_data.values()]
                global_min = min(np.min(column) for column in columns)
                global_max = max(np.max(column) for column in columns)

                boundaries = lin_spaced_tic_dict[tic]
                assert boundaries[0] == global_min
                assert boundaries[-1] == global_max
                assert len(boundaries) == n_bins

            return True

        def test_histogram_data():
            """``tica_histogram`` matches a direct numpy histogram."""
            H_dict, H_calc, _ = tica_histogram(prj,
                                               prt1, [0],
                                               x_array=lin_spaced_tic_dict[0],
                                               n_bins=None)
            assert len(H_dict.keys()) == prt1.n_states_
            assert len(H_calc) == len(lin_spaced_tic_dict[0]) - 1

            rnd_state = np.random.randint(0, prt1.n_states_)
            # ``density=True`` replaces the ``normed`` keyword, which newer
            # NumPy releases no longer accept.
            reference, _ = np.histogram(prt1.tic_dict[0][rnd_state],
                                        bins=lin_spaced_tic_dict[0],
                                        density=True)
            assert np.allclose(H_dict[rnd_state], reference)
            return True

        def test_one_dim_free_energy():
            """The free energy frame is tagged with protein name and "mle"."""
            df = one_dim_tic_free_energy(prj,
                                         prt1,
                                         0,
                                         n_bins=None,
                                         lin_spaced_tic=lin_spaced_tic_dict[0],
                                         errorbars=False)

            assert (df.protein_name == prt1.name).all()
            assert (df.mdl_index == "mle").all()

            return True

        assert test_bounds()
        assert test_histogram_data()
        assert test_one_dim_free_energy()

        return
예제 #10
0
def test_plotting_utils():
    """Exercise the plotting helpers on a freshly fitted fake series.

    Fake data for "kinase_1" and "kinase_2" is generated in a temporary
    directory, the series analysis is set up and fitted, and three nested
    checks are run: the global tic boundaries, the per-state tic histograms
    and the one dimensional tic free energy frame.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1"],
                        "kinase_2": ["fake_proj2"]}
        # NOTE(review): 'bootrap__n_samples' may be a typo for
        # 'bootstrap__n_samples'; kept as-is since the consumer of this key
        # is not visible here -- confirm.
        mdl_params = {'tica__n_components': 1, 'tica__lag_time': 1,
                      'tica__kinetic_mapping': True, 'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2, 'msm__lag_time': 1,
                      'bootrap__n_samples': 1}

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(base_dir, mdl_dir, feature_dir,
                              series_name, protein_list,
                              project_dict, mdl_params)

        fit_pipeline(base_dir)
        prj = ProteinSeries(os.path.join(mdl_dir, "project.yaml"))

        prt1 = Protein(prj, "kinase_1")
        prt2 = Protein(prj, "kinase_2")

        # NOTE(review): presumably a padding multiplier for the boundaries,
        # set to zero so min/max can be matched exactly -- confirm.
        prt1._mlpt_fct = 0.0
        prt2._mlpt_fct = 0.0
        n_bins = 100

        lin_spaced_tic_dict = global_tic_boundaries([prt1, prt2],
                                                    range(prt1.n_tics_),
                                                    n_bins)

        def _tic_extreme(reducer, np_reducer, tic_index):
            # Reduce one tic's column over every trajectory of both proteins.
            return reducer(
                reducer(np_reducer(traj_data[:, tic_index])
                        for traj_data in prt.tica_data.values())
                for prt in (prt1, prt2))

        def test_bounds():
            """Boundaries start at the global min and end at the global max."""
            locally_calc = {}
            for tic_index in range(prt1.n_tics_):
                # Index the tic's own column; reducing over whole rows only
                # works when the model has a single component.
                locally_calc[tic_index] = [
                    _tic_extreme(min, np.min, tic_index),
                    _tic_extreme(max, np.max, tic_index),
                ]

            for tic_index in range(prt1.n_tics_):
                assert lin_spaced_tic_dict[tic_index][0] == \
                    locally_calc[tic_index][0]
                assert lin_spaced_tic_dict[tic_index][-1] == \
                    locally_calc[tic_index][-1]
                assert len(lin_spaced_tic_dict[tic_index]) == n_bins

            return True

        def test_histogram_data():
            """Per-state histograms equal a direct ``np.histogram`` result."""
            H_dict, H_calc, _ = tica_histogram(prj, prt1, [0],
                                               x_array=lin_spaced_tic_dict[0],
                                               n_bins=None)
            assert len(H_dict.keys()) == prt1.n_states_
            assert len(H_calc) == len(lin_spaced_tic_dict[0]) - 1
            rnd_state = np.random.randint(0, prt1.n_states_)
            # Use ``density`` instead of ``normed``: the latter keyword has
            # been removed from np.histogram in newer NumPy releases.
            expected_hist = np.histogram(prt1.tic_dict[0][rnd_state],
                                         bins=lin_spaced_tic_dict[0],
                                         density=True)[0]
            assert np.allclose(H_dict[rnd_state], expected_hist)
            return True

        def test_one_dim_free_energy():
            """Rows carry the protein's name and the "mle" model index."""
            df = one_dim_tic_free_energy(
                prj, prt1, 0, n_bins=None,
                lin_spaced_tic=lin_spaced_tic_dict[0], errorbars=False)

            assert (df.protein_name == prt1.name).all()
            assert (df.mdl_index == "mle").all()

            return True

        assert test_bounds()
        assert test_histogram_data()
        assert test_one_dim_free_energy()

        return