def test_setup_series_analysis():
    """Check the directory layout and YAML written by ``setup_series_analysis``.

    Inside a temporary directory, builds the fixture with
    ``create_fake_series``, runs ``setup_series_analysis`` and asserts that:

    * the model directory and one sub-directory per protein exist,
    * ``series.yaml`` exists under ``base_dir``,
    * ``project.yaml`` under the model directory stores the same values that
      were passed in.
    """
    base_dir = os.path.join("./fake_series")
    mdl_dir = os.path.join(base_dir, "new_mdl_dir")
    feature_dir = "feature_dir"
    series_name = "fake_series"
    protein_list = ["fake_kinase1", "fake_kinase2"]
    project_dict = {
        "fake_kinase1": ["fake_proj1", "fake_proj2"],
        "fake_kinase2": ["fake_proj3"],
    }
    mdl_params = {
        'tica__n_components': 1,
        'tica__lag_time': 2,
        'tica__kinetic_mapping': True,
        'tica__shrinkage': 0.01,
        'cluster__n_clusters': 174,
    }

    # All paths above are relative, so every check must run while the
    # temporary directory is still the working directory.
    with enter_temp_directory():
        create_fake_series()
        setup_series_analysis(base_dir, mdl_dir, feature_dir,
                              series_name, protein_list,
                              project_dict, mdl_params)

        assert os.path.isdir(mdl_dir)
        for protein in protein_list:
            assert os.path.isdir(os.path.join(mdl_dir, protein))
        assert os.path.isfile(os.path.join(base_dir, "series.yaml"))

        # Close the handle deterministically and use the safe loader:
        # a bare ``yaml.load(stream)`` is rejected by recent PyYAML releases.
        # NOTE(review): assumes project.yaml holds only plain YAML types
        # (str/int/float/bool/list/dict) -- confirm against the writer.
        with open(os.path.join(mdl_dir, "project.yaml"), 'r') as fin:
            yaml_file = yaml.safe_load(fin)

        assert yaml_file["base_dir"] == base_dir
        assert yaml_file["series_name"] == series_name
        assert yaml_file["protein_list"] == protein_list
        assert yaml_file["project_dict"] == project_dict
        assert yaml_file["mdl_params"] == mdl_params
def test_project():
    """Fit the pipeline on fake data and check the ``ProteinSeries`` object.

    Works in a temporary directory: generates fake data, sets up the series
    analysis, runs ``fit_pipeline`` and then loads ``project.yaml`` into a
    ``ProteinSeries``. The remaining checks are delegated to the
    ``_test_*`` helpers, each of which must return a truthy value.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        print(base_dir)
        print(type(base_dir))

        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {
            "kinase_1": ["fake_proj1"],
            "kinase_2": ["fake_proj2"],
        }
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__weighted_transform': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            'bootstrap__n_samples': 1,
        }

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(
            base_dir, mdl_dir, feature_dir, series_name,
            protein_list, project_dict, mdl_params,
        )
        fit_pipeline(base_dir)

        project_yaml = os.path.join(mdl_dir, "project.yaml")
        prj = ProteinSeries(project_yaml)

        assert isinstance(prj, ProteinSeries)
        assert isinstance(prj.tica_mdl, tICA)
        assert _test_protein_without_project()
        assert _test_protein_with_project(prj)
        assert _test_tic_dict(prj)
        assert _test_obs_mapping(prj)
    return
def _setup_test():
    """Recreate the ``mdl_dir`` analysis directory for the fake series.

    Removes any ``mdl_dir`` left under ``base_dir`` by an earlier run, then
    calls ``setup_series_analysis`` with a fixed configuration of two
    proteins and three projects.

    NOTE(review): ``base_dir`` is not defined inside this function; it is
    presumably a module-level name -- confirm.
    """
    mdl_dir = os.path.join(base_dir, "mdl_dir")

    # Best-effort removal of the previous model directory. A missing
    # directory is the normal first-run case, so removal errors are ignored,
    # but (unlike a bare ``except``) KeyboardInterrupt still propagates.
    shutil.rmtree(mdl_dir, ignore_errors=True)

    setup_series_analysis(
        base_dir=base_dir,
        mdl_dir=os.path.abspath(mdl_dir),
        feature_dir="features",
        series_name="fake_series",
        protein_list=["kinase_1", "kinase_2"],
        project_dict={
            "kinase_1": ["fake_proj1", "fake_proj2"],
            "kinase_2": ["fake_proj3"],
        },
        mdl_params={
            "cluster__n_clusters": 2,
            "msm__lag_time": 1,
            "tica__shrinkage": 0.005,
            "tica__lag_time": 1,
            "tica__n_components": 2,
            "tica__weighted_transform": True,
        },
    )
def test_multiple_mdls():
    """Call ``setup_series_analysis`` three times and expect three projects.

    The same arguments are used for every call, with a one-second pause
    after each one. Afterwards exactly three ``project.yaml`` files must be
    found one level below ``./fake_series``.
    """
    base_dir = os.path.join("./fake_series")
    mdl_dir = os.path.join(base_dir, "new_mdl_dir")
    feature_dir = "feature_dir"
    series_name = "fake_series"
    protein_list = ["fake_kinase1", "fake_kinase2"]
    project_dict = {
        "fake_kinase1": ["fake_proj1", "fake_proj2"],
        "fake_kinase2": ["fake_proj3"],
    }
    mdl_params = {
        'tica__n_components': 4,
        'tica__lag_time': 223,
        'tica__kinetic_mapping': True,
        'tica__gamma': 0.0121,
        'cluster__n_clusters': 212,
    }
    setup_args = (base_dir, mdl_dir, feature_dir, series_name,
                  protein_list, project_dict, mdl_params)
    n_repeats = 3

    with enter_temp_directory():
        create_fake_series()
        for _ in range(n_repeats):
            setup_series_analysis(*setup_args)
            # NOTE(review): the pause presumably keeps time-stamped
            # directory names distinct between calls -- confirm.
            time.sleep(1)
        found = glob.glob("./fake_series/*/project.yaml")
        assert len(found) == 3
    return
def test_pipeline():
    """Run each pipeline stage in turn and check the artifacts it leaves.

    After generating fake data in a temporary directory, the tICA, k-means,
    MSM and bootstrap stages are invoked one by one on the object returned
    by ``setup_series_analysis``. The test then checks that the tICA model
    counted every row of every ``*.jl`` feature file and that the expected
    pickle files exist in the model directory.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1"],
                        "kinase_2": ["fake_proj2"]}
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__kinetic_mapping': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            'bootstrap__n_samples': 1,
        }

        create_fake_data(base_dir, protein_list, project_dict)
        yaml_file = setup_series_analysis(base_dir, mdl_dir, feature_dir,
                                          series_name, protein_list,
                                          project_dict, mdl_params)

        # Stages are order-dependent: fit before transform, tICA before
        # clustering, clustering before the MSMs.
        fit_protein_tica(yaml_file)
        transform_protein_tica(yaml_file)
        fit_protein_kmeans(yaml_file)
        transform_protein_kmeans(yaml_file)
        fit_msms(yaml_file)
        fit_bootstrap(yaml_file)

        # Total number of rows across every feature file of every protein.
        raw_count_obs = 0
        for protein in protein_list:
            pattern = os.path.join(base_dir, protein, feature_dir, "*.jl")
            for feature_file in glob.glob(pattern):
                raw_count_obs += verboseload(feature_file).shape[0]

        tica_mdl = verboseload(os.path.join(mdl_dir, "tica_mdl.pkl"))
        # make sure the mdl is seeing all the data, could probably have a
        # far stronger test here
        assert tica_mdl.n_observations_ == raw_count_obs

        expected_artifacts = (
            "kinase_1/tica_data.pkl",
            "kinase_2/tica_data.pkl",
            "kinase_1/msm_mdl.pkl",
            "kinase_2/msm_mdl.pkl",
            "kinase_2/bootstrap_msm_mdl.pkl",
            "kmeans_mdl.pkl",
        )
        for relative_path in expected_artifacts:
            assert os.path.exists(os.path.join(mdl_dir, relative_path))
    return
def test_pipeline():
    """Exercise the individual fit/transform stages on fake data.

    Sets up a two-protein series in a temporary directory, runs the tICA,
    k-means, MSM and bootstrap stages in sequence, and verifies that the
    saved tICA model's ``n_observations_`` equals the summed row count of
    the ``*.jl`` feature files, and that the expected model/data pickles
    were written under the model directory.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {
            "kinase_1": ["fake_proj1"],
            "kinase_2": ["fake_proj2"],
        }
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__kinetic_mapping': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            'bootstrap__n_samples': 1,
        }

        create_fake_data(base_dir, protein_list, project_dict)
        yaml_file = setup_series_analysis(
            base_dir, mdl_dir, feature_dir, series_name,
            protein_list, project_dict, mdl_params,
        )

        fit_protein_tica(yaml_file)
        transform_protein_tica(yaml_file)
        fit_protein_kmeans(yaml_file)
        transform_protein_kmeans(yaml_file)
        fit_msms(yaml_file)
        fit_bootstrap(yaml_file)

        # Sum of the first-axis lengths of all feature files, visited in
        # protein order and then glob order.
        raw_count_obs = sum(
            verboseload(path).shape[0]
            for name in protein_list
            for path in glob.glob(
                os.path.join(base_dir, name, feature_dir, "*.jl"))
        )

        tica_path = os.path.join(mdl_dir, "tica_mdl.pkl")
        tica_mdl = verboseload(tica_path)
        # make sure the mdl is seeing all the data, could probably have a
        # far stronger test here
        assert tica_mdl.n_observations_ == raw_count_obs

        def in_mdl_dir(relative):
            """Return the path of ``relative`` inside the model directory."""
            return os.path.join(mdl_dir, relative)

        assert os.path.exists(in_mdl_dir("kinase_1/tica_data.pkl"))
        assert os.path.exists(in_mdl_dir("kinase_2/tica_data.pkl"))
        assert os.path.exists(in_mdl_dir("kinase_1/msm_mdl.pkl"))
        assert os.path.exists(in_mdl_dir("kinase_2/msm_mdl.pkl"))
        assert os.path.exists(in_mdl_dir("kinase_2/bootstrap_msm_mdl.pkl"))
        assert os.path.exists(in_mdl_dir("kmeans_mdl.pkl"))
    return
def test_slicer():
    """Check that ``series_feature_slicer`` writes column-sliced features.

    For each protein a list of column indices is supplied. After slicing,
    every ``*.jl`` file in the protein's feature directory must have a
    same-named counterpart in ``sliced_feature_dir`` equal to the original
    array indexed with ``[:, indices]``.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {
            "kinase_1": ["fake_proj1"],
            "kinase_2": ["fake_proj2"],
        }
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__kinetic_mapping': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            'bootstrap__n_samples': 1,
        }

        create_fake_data(base_dir, protein_list, project_dict)
        yaml_file = setup_series_analysis(
            base_dir, mdl_dir, feature_dir, series_name,
            protein_list, project_dict, mdl_params,
        )

        # The second entry repeats and reorders indices on purpose-looking
        # values; they are passed through unchanged.
        dict_feat_ind = {
            "kinase_1": [0, 2],
            "kinase_2": [1, 1, 0, 2],
        }
        series_feature_slicer(yaml_file, dict_feat_ind)

        for protein in protein_list:
            wanted_columns = dict_feat_ind[protein]
            with enter_protein_data_dir(yaml_file, protein):
                assert os.path.isdir("sliced_feature_dir")
                for fname in glob.glob("./%s/*.jl" % feature_dir):
                    expected = verboseload(fname)[:, wanted_columns]
                    sliced_path = "./%s/%s" % ("sliced_feature_dir",
                                               os.path.basename(fname))
                    written = verboseload(sliced_path)
                    assert (expected == written).all()
    return
def test_plotting_utils():
    """Check the tIC binning, histogram and free-energy plotting helpers.

    Fits the full pipeline on fake data in a temporary directory, loads two
    ``Protein`` objects and then runs three nested checks:

    * ``test_bounds`` -- ``global_tic_boundaries`` spans the global min/max
      of both proteins' tICA data with ``n_bins`` points,
    * ``test_histogram_data`` -- ``tica_histogram`` agrees with
      ``np.histogram`` for a randomly chosen state,
    * ``test_one_dim_free_energy`` -- the frame returned by
      ``one_dim_tic_free_energy`` is labelled with the protein name and the
      ``"mle"`` model index.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {
            "kinase_1": ["fake_proj1"],
            "kinase_2": ["fake_proj2"],
        }
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__kinetic_mapping': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            # Key spelling fixed (was 'bootrap__n_samples') so it matches
            # the key used by the other tests in this file.
            'bootstrap__n_samples': 1,
        }

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(base_dir, mdl_dir, feature_dir,
                              series_name, protein_list,
                              project_dict, mdl_params)
        fit_pipeline(base_dir)

        prj = ProteinSeries(os.path.join(mdl_dir, "project.yaml"))
        prt1 = Protein(prj, "kinase_1")
        prt2 = Protein(prj, "kinase_2")
        # NOTE(review): presumably disables padding of the bin range so the
        # edges equal the raw data extrema -- confirm.
        prt1._mlpt_fct = 0.0
        prt2._mlpt_fct = 0.0

        n_bins = 100
        lin_spaced_tic_dict = global_tic_boundaries(
            [prt1, prt2], range(prt1.n_tics_), n_bins)

        def test_bounds():
            """Compare the bin edges with locally computed extrema."""
            locally_calc = {}
            for i in range(prt1.n_tics_):
                # NOTE(review): the extrema below are not indexed by ``i``;
                # this looks valid only because one tIC is fitted -- confirm.
                global_min = min(
                    min([min(traj) for traj in prt1.tica_data.values()]),
                    min([min(traj) for traj in prt2.tica_data.values()]))
                global_max = max(
                    max([max(traj) for traj in prt1.tica_data.values()]),
                    max([max(traj) for traj in prt2.tica_data.values()]))
                locally_calc[i] = [global_min, global_max]

            for i in range(prt1.n_tics_):
                assert lin_spaced_tic_dict[i][0] == locally_calc[i][0]
                assert lin_spaced_tic_dict[i][-1] == locally_calc[i][-1]
                assert len(lin_spaced_tic_dict[i]) == n_bins
            return True

        def test_histogram_data():
            """Compare ``tica_histogram`` output with ``np.histogram``."""
            H_dict, H_calc, _ = tica_histogram(
                prj, prt1, [0],
                x_array=lin_spaced_tic_dict[0], n_bins=None)
            assert len(H_dict.keys()) == prt1.n_states_
            assert len(H_calc) == len(lin_spaced_tic_dict[0]) - 1

            rnd_state = np.random.randint(0, prt1.n_states_)
            # ``density=True`` replaces ``normed=True``, which current NumPy
            # no longer accepts.
            # NOTE(review): assumes the bin edges are equal-width, where both
            # keywords give the same values -- confirm.
            reference = np.histogram(prt1.tic_dict[0][rnd_state],
                                     bins=lin_spaced_tic_dict[0],
                                     density=True)[0]
            assert np.allclose(H_dict[rnd_state], reference)
            return True

        def test_one_dim_free_energy():
            """Check the labels on the one-dimensional free-energy frame."""
            df = one_dim_tic_free_energy(
                prj, prt1, 0, n_bins=None,
                lin_spaced_tic=lin_spaced_tic_dict[0],
                errorbars=False)
            assert (df.protein_name == prt1.name).all()
            assert (df.mdl_index == "mle").all()
            return True

        assert test_bounds()
        assert test_histogram_data()
        assert test_one_dim_free_energy()
    return
def test_plotting_utils():
    """Validate the plotting helpers against a freshly fitted fake series.

    The pipeline is fitted on fake data inside a temporary directory, and
    two ``Protein`` objects are built from the resulting project. Three
    nested checks then cover ``global_tic_boundaries``, ``tica_histogram``
    and ``one_dim_tic_free_energy``; each returns ``True`` on success.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1"],
                        "kinase_2": ["fake_proj2"]}
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__kinetic_mapping': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            # Was misspelled 'bootrap__n_samples'; the other tests in this
            # file use 'bootstrap__n_samples'.
            'bootstrap__n_samples': 1,
        }

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(base_dir, mdl_dir, feature_dir,
                              series_name, protein_list,
                              project_dict, mdl_params)
        fit_pipeline(base_dir)

        prj = ProteinSeries(os.path.join(mdl_dir, "project.yaml"))
        prt1 = Protein(prj, "kinase_1")
        prt2 = Protein(prj, "kinase_2")
        # NOTE(review): zeroing ``_mlpt_fct`` presumably removes any margin
        # added around the data range -- confirm.
        prt1._mlpt_fct = 0.0
        prt2._mlpt_fct = 0.0

        n_bins = 100
        lin_spaced_tic_dict = global_tic_boundaries(
            [prt1, prt2], range(prt1.n_tics_), n_bins)

        def test_bounds():
            """First/last bin edge must equal the global min/max."""
            locally_calc = {}
            for i in range(prt1.n_tics_):
                # NOTE(review): min/max are taken over whole trajectories,
                # not column ``i``; fine for a single tIC -- confirm intent.
                locally_calc[i] = []
                global_min = min(
                    min([min(arr) for arr in prt1.tica_data.values()]),
                    min([min(arr) for arr in prt2.tica_data.values()]))
                locally_calc[i].append(global_min)
                global_max = max(
                    max([max(arr) for arr in prt1.tica_data.values()]),
                    max([max(arr) for arr in prt2.tica_data.values()]))
                locally_calc[i].append(global_max)

            for i in range(prt1.n_tics_):
                edges = lin_spaced_tic_dict[i]
                assert edges[0] == locally_calc[i][0]
                assert edges[-1] == locally_calc[i][-1]
                assert len(edges) == n_bins
            return True

        def test_histogram_data():
            """Per-state histogram must match ``np.histogram``."""
            H_dict, H_calc, _ = tica_histogram(
                prj, prt1, [0],
                x_array=lin_spaced_tic_dict[0], n_bins=None)
            assert len(H_dict.keys()) == prt1.n_states_
            assert len(H_calc) == len(lin_spaced_tic_dict[0]) - 1

            rnd_state = np.random.randint(0, prt1.n_states_)
            # The ``normed`` keyword is gone from current NumPy, so use
            # ``density``.
            # NOTE(review): equivalent only for equal-width bins, which the
            # name ``lin_spaced_tic_dict`` suggests -- confirm.
            expected_hist, _edges = np.histogram(
                prt1.tic_dict[0][rnd_state],
                bins=lin_spaced_tic_dict[0],
                density=True)
            assert np.allclose(H_dict[rnd_state], expected_hist)
            return True

        def test_one_dim_free_energy():
            """Returned frame must carry the protein name and "mle" index."""
            df = one_dim_tic_free_energy(
                prj, prt1, 0, n_bins=None,
                lin_spaced_tic=lin_spaced_tic_dict[0],
                errorbars=False)
            assert (df.protein_name == prt1.name).all()
            assert (df.mdl_index == "mle").all()
            return True

        assert test_bounds()
        assert test_histogram_data()
        assert test_one_dim_free_energy()
    return