def featurize_project_wrapper(yaml_file, protein, feat=None, stride=1, view=None, protein_only=True):
    """
    Wrapper function for featurizing project.
    :param yaml_file: The yaml file to work with
    :param protein: Protein Name
    :param feat: Featurization obj. If none, it defaults to
    phi, psi and chi1. Should support a describe_features attribute
    :param view: ipython view or pool view to parallelize over.
    :return:
    """

    yaml_file = load_yaml_file(yaml_file)
    base_dir = yaml_file["base_dir"]

    _check_output_folder_exists(yaml_file, protein)
    #get the paths
    if protein_only:
        traj_folder = os.path.join(base_dir, protein, yaml_file["protein_dir"])
    else:
        traj_folder = os.path.join(base_dir, protein, "trajectories")
    traj_files = sorted(glob.glob(os.path.join(traj_folder, "*.hdf5")),
                        key=keynat)
    print("Found %d files for featurization in %s"
          % (len(traj_files), traj_folder))

    jobs = [(yaml_file, protein, feat, traj_file, stride) for traj_file in traj_files]

    #fall back to a multiprocessing pool when no view is supplied
    if view is None:
        view = Pool()
    result = view.map(featurize_file, jobs)

    return result
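
A minimal usage sketch (the yaml path and protein name are placeholders; it assumes a multiprocessing Pool is an acceptable stand-in for an IPython view, since both expose a map method):

from multiprocessing import Pool

#hypothetical usage: featurize one protein with the default phi/psi/chi1
#dihedrals; any object with a .map(func, iterable) method works as `view`
if __name__ == "__main__":
    pool = Pool(4)
    results = featurize_project_wrapper("project.yaml", "kinase_1",
                                        feat=None, stride=1, view=pool)
    pool.close()
    pool.join()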
Example #2
def sample_tic_region(yaml_file, protein_name, tic_region,
                      n_frames=50, fname=None,save_trj=True):
    """
    Helper function for sampling tic in a particular tic_region.
    :param yaml_file: The projects yaml file
    :param protein_name: The name of the protein
    :param tic_region(dict): The tic_region. Can be multidimensional with
    1 number per tic coordinate(defaults to 0 for all non-mentioned regions)
    :param n_frames: The number of frames around the coordinate
    :return:
    """

    yaml_file = load_yaml_file(yaml_file)

    prj = ProteinSeries(yaml_file)
    prt = Protein(prj, protein_name)

    key_list = list(prt.tica_data.keys())
    data = [prt.tica_data[i] for i in key_list]
    indices = sample_region(data, tic_region, n_frames)

    if fname is None:
        fname = "sampled_tic_region.xtc"
    trj = _frame_loader(yaml_file, prt, key_list, indices, save_trj, fname)

    return trj
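
The tic_region dict maps tic indices to coordinates. A short hedged example (yaml path, protein name, and coordinates are placeholders):

#sample ~50 frames near the point tIC0 = 1.2, tIC2 = -0.5; tics that are
#not mentioned in the dict default to 0
region = {0: 1.2, 2: -0.5}
trj = sample_tic_region("project.yaml", "kinase_1", tic_region=region,
                        n_frames=50, fname="tic_region_sample.xtc")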
Example #3
def sample_for_all_proteins(yaml_file, protein=None, tics=None, n_frames=100,
                            scheme="linear"):
    """
    :param yaml_file: The project yaml file.
    :param protein: The name of the protein. If none, then it is
    done for all the protein names in the yaml_file. If it is a list,
    it is iteratively done for each of the protein else its only called
    once.
    :param tics: list of tics to sample from. If None, then
    it is done for all the tics specified in the yaml file
    :param n_frames number of frames wanted for each tic
    :param scheme:One of 3 sampling schemes
    linear:Samples the tic linearly
    random:Samples the tic randomly
    edge: Samples the tic edges only
    :return:
    """

    yaml_file = load_yaml_file(yaml_file)
    if protein is None:
        protein = yaml_file["protein_list"]
    elif isinstance(protein, str):
        #wrap a single protein name so the loop below works
        protein = [protein]

    if tics is None:
        tics = range(yaml_file["params"]["tica__n_components"])

    for protein_name in protein:
        for tic_index in tics:
            sample_one_tic(yaml_file, protein_name, tic_index, n_frames,
                           scheme)

    return
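
A short usage sketch (the yaml path is a placeholder):

#sketch: linearly sample 100 frames along tICs 0 and 1 for every protein
#listed in the project yaml file
sample_for_all_proteins("project.yaml", protein=None, tics=[0, 1],
                        n_frames=100, scheme="linear")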
def featurize_file(job_tuple):

    yaml_file, protein, feat, traj_file, stride = job_tuple
    yaml_file = load_yaml_file(yaml_file)

    if feat is None:
        feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])

    _check_output_folder_exists(yaml_file, protein)

    output_folder = os.path.join(yaml_file["base_dir"],
                                 protein,
                                 yaml_file["feature_dir"])

    traj_name = os.path.splitext(os.path.basename(traj_file))[0]
    output_fname = os.path.join(output_folder, traj_name+".jl")

    feat_descriptor = os.path.join(output_folder, "feature_descriptor.h5")
    try:
        trj = mdt.load(traj_file)
    except Exception:
        warnings.warn("Removing %s because of malformed trajectory" % traj_file)
        os.remove(traj_file)
        return

    features = feat.partial_transform(trj)
    verbosedump(features, output_fname)

    if not os.path.isfile(feat_descriptor) and hasattr(feat, "describe_features"):
        dih_df = pd.DataFrame(feat.describe_features(trj[0]))
        verbosedump(dih_df, feat_descriptor)

    return
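
featurize_file packs its arguments into a single tuple so it can be dispatched with view.map, which passes exactly one argument per job. A hedged single-job example (all values are placeholders):

#one job tuple: (yaml file, protein name, featurizer or None, trajectory
#file, stride); passing None picks the default phi/psi/chi1 featurizer
job = ("project.yaml", "kinase_1", None, "trj0.hdf5", 1)
featurize_file(job)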
Example #7
def test_dihedral_feat():

    print(base_dir)
    pool = Pool(6)
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))

    for prt in ["kinase_1", "kinase_2"]:
        print(prt)
        prj = yaml_file["project_dict"][prt][0]
        featurize_project_wrapper(yaml_file, prt, feat=None, stride=1, view=pool)

        feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
        flist = glob.glob(os.path.join(base_dir, prt, yaml_file["protein_dir"], "*.hdf5"))
        for i in np.random.choice(flist, 3):
            trj = mdt.load(i)
            my_feat = feat.partial_transform(trj)
            expected_fname = os.path.join(base_dir, prt,
                                          yaml_file["feature_dir"],
                                          os.path.splitext(os.path.basename(i))[0]+".jl")
            calc_feat = verboseload(expected_fname)

            assert np.allclose(my_feat, calc_feat)

    return True
Example #8
def test_subsampler():
    print(base_dir)
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    out_dir = "sub_protein_traj"
    subsample_series(yaml_file, out_dir=out_dir, overwrite=False)
    assert os.path.isdir(os.path.join(base_dir, "kinase_1", out_dir))
    for k in ["kinase_1", "kinase_2"]:
        for i in glob.glob(os.path.join(base_dir, k, "protein_traj", "*.hdf5")):
            t1 = mdt.load(i)
            t2 = mdt.load(os.path.join(base_dir, k, out_dir, os.path.basename(i)))
            assert t1.n_frames == t2.n_frames * 5
def _check_output_folder_exists(yaml_file, protein, folder_name=None):
    yaml_file = load_yaml_file(yaml_file)
    if folder_name is None:
        folder_name = yaml_file["feature_dir"]
    output_folder = os.path.join(yaml_file["base_dir"],
                                 protein, folder_name)

    if not os.path.isdir(output_folder):
        os.mkdir(output_folder)

    return
Example #10
def test_get_common_features_2():
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        t = load_random_traj(yaml_file, protein)
        aligned_dict[protein] = t.top.to_fasta(chain=0)

    f = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    common_feature_dic, _ = _get_common_features(yaml_file, f, aligned_dict, False)
    assert len(set(len(common_feature_dic[i]) for i in yaml_file["protein_list"])) == 1
    return
Example #11
def test_present_for_all_same_seq():
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        t = load_random_traj(yaml_file, protein)
        aligned_dict[protein] = t.top.to_fasta(chain=0)

    for protein in yaml_file["protein_list"]:
        aligned_seq = aligned_dict[protein]
        prt_mapping, prt_seq = _map_residue_ind_seq_ind(yaml_file, protein, aligned_seq)
        assert len(_present_for_all(protein, prt_mapping, prt_seq, aligned_dict)) == len(prt_seq)
    return
Example #12
def subsample_series(yaml_file, stride=5, out_dir="sub_protein_traj", overwrite=True):
    yaml_file = load_yaml_file(yaml_file)
    for protein in yaml_file["protein_list"]:
        subsample_protein(yaml_file, protein, stride, out_dir)
    yaml_file["protein_dir"] = out_dir
    #write the new yaml file
    if overwrite:
        with open(os.path.join(yaml_file["mdl_dir"],
                               'project.yaml'), 'w') as yaml_out:
            yaml_out.write(yaml.dump(yaml_file))

    return
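
A usage sketch (placeholder path); note that with overwrite=True the function rewrites project.yaml so that protein_dir points at the subsampled folder:

#subsample every protein's trajectories 5x into sub_protein_traj without
#touching the original project.yaml
subsample_series("project.yaml", stride=5, out_dir="sub_protein_traj",
                 overwrite=False)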
Example #13
def validate_series(yaml_file, sequence_dictionary):
    """
    :param yaml_file: The mdl yaml file.
    :param sequence_dictionary: Dictionary of sequences
    :return: Runs a large number of sequence tests on the series to make sure
    the sequences for each protein match the given sequence and the series itself
    """
    yaml_file = load_yaml_file(yaml_file)
    p = Pool(cpu_count())
    jobs = [(yaml_file, protein, sequence_dictionary) for protein in yaml_file["protein_list"]]
    p.map(_validate_protein, jobs)

    return
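
The sequence dictionary is keyed on protein name. A hedged sketch with placeholder one-letter sequences (not real kinase sequences):

#placeholder sequences keyed on the protein names used in the project yaml
seq_dict = {"kinase_1": "MGSSHHHHHH",
            "kinase_2": "MGSSHHHHHT"}
validate_series("project.yaml", seq_dict)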
Example #14
def test_get_common_residues():
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        t = load_random_traj(yaml_file, protein)
        aligned_dict[protein] = t.top.to_fasta(chain=0)

    res_dic, prt_seq = _get_common_residues(yaml_file, aligned_dict)
    for protein in yaml_file["protein_list"]:
        #reload a trajectory for this protein so the comparison does not
        #silently use the last trajectory from the loop above
        t = load_random_traj(yaml_file, protein)
        print(len(res_dic[protein]), t.n_residues)
        assert len(res_dic[protein]) == len(t.top.to_fasta(chain=0))

    return
Example #15
def test_get_common_features():
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        t = load_random_traj(yaml_file, protein)
        aligned_dict[protein] = t.top.to_fasta(chain=0)

    f = DihedralFeaturizer()
    common_feature_dic, _ = _get_common_features(yaml_file, f, aligned_dict, False)
    for protein in yaml_file["protein_list"]:
        t = load_random_traj(yaml_file, protein)
        assert len(common_feature_dic[protein]) == f.transform(t)[0].shape[1]

    return
Example #16
def sample_one_tic(yaml_file, protein_name, tic_index, n_frames, scheme="linear"):
    """
    :param yaml_file: The project's yaml file
    :param protein_name: The name of the protein
    :param tic_index: Tic index to sample along
    :param n_frames: The number of frames wanted
    :param scheme: One of "linear", "random", or "edge"
    :return: Dumps a tic%d.xtc and tic%d.log for the given
    protein inside its model directory.
    """
    yaml_file = load_yaml_file(yaml_file)
    prj = ProteinSeries(yaml_file)
    prt = Protein(prj, protein_name)

    return pull_frames(yaml_file, prt, tic_index, n_frames, scheme)
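
A short usage sketch (placeholder path and protein name):

#dump 100 frames sampled linearly along tIC 0 for one protein; output
#lands as tic0.xtc / tic0.log inside the protein's model directory
sample_one_tic("project.yaml", "kinase_1", tic_index=0, n_frames=100,
               scheme="linear")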
Example #19
def test_series_slicer(yaml_file, folder_name="sliced_feature_dir"):
    yaml_file = load_yaml_file(yaml_file)

    df_dict = {}
    for protein in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, protein):
            df_dict[protein] = verboseload(os.path.join(os.getcwd(),
                folder_name, "feature_descriptor.h5"))
    for protein in yaml_file["protein_list"]:
        for protein2 in yaml_file["protein_list"]:
            assert (df_dict[protein].resnames ==
                    df_dict[protein2].resnames).all()

    return
Example #21
def test_map_residue_seq_with_insert():
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        expected = {}

        t = load_random_traj(yaml_file, protein)

        expected[protein] = [i.index + 3 for i in t.top.residues if i.is_protein]
        aligned_dict[protein] = "---" + t.top.to_fasta(chain=0)
        aligned_seq = aligned_dict[protein]
        actual, _ = _map_residue_ind_seq_ind(yaml_file, protein, aligned_seq)
        assert expected[protein] == list(actual.values())

    return
Example #22
def test_normalize_features():

    print(base_dir)
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    normalize_project_series(yaml_file, stride=1)

    all_data = []

    for kinase in ["kinase_1", "kinase_2"]:
        flist = glob.glob(os.path.join(base_dir, kinase, "normalized_features/*.jl"))

        for i in flist:
            all_data.extend(verboseload(i))

    #each normalized feature (column) should have mean ~0 and std ~1
    assert np.all(np.isclose(np.mean(all_data, axis=0), 0, atol=0.2))
    assert np.all(np.isclose(np.std(all_data, axis=0), 1, atol=0.3))
Example #23
def pull_features(yaml_file, prt, skip=1, feature_indices=None):
    """
    Simple utility to pull certain features from the feature_folder object
    :param prt: Protein model to use
    :param skip: skip for each file(defaults to 1)
    :param feature_indices: which indices to pull
    :return: dictionary keyed on file name with feature values as arrays
    """
    yaml_file = load_yaml_file(yaml_file)
    all_f ={}
    with enter_protein_data_dir(yaml_file, prt.name):
        feature_file_list = glob.glob("./%s/*.jl"%yaml_file["feature_dir"])
        for i in feature_file_list:
            all_f[os.path.basename(i)]=load(i)[:, feature_indices]

    return all_f
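
A usage sketch; the ProteinSeries/Protein construction mirrors the samplers above, and the yaml path, protein name, and indices are placeholders:

import numpy as np

#pull the first ten feature columns from every 2nd frame of each file
yaml_file = load_yaml_file("project.yaml")
prj = ProteinSeries(yaml_file)
prt = Protein(prj, "kinase_1")
feature_dict = pull_features("project.yaml", prt, skip=2,
                             feature_indices=np.arange(10))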
Example #24
def normalize_project_series(yaml_file, output_folder="normalized_features",
                             stride=40,nrm=None):
    """
    routine to take a set of proteins features stored in the feature_dir and
    normalize them by removing the mean and setting variance to 1 using the standard
    scaler. The normalizer is dumped into the mdl dir.
    :param yaml_file: The yaml file to work with.
    :param output_folder: The name of the output folder to dump normalized features in
    :param stride: The initial stride in files to fit the normalizer with.
    This is necessary to prevent memory errors. defaults to every 40th file
    :param nrm: previously fit normalizer. else it uses the standard scaler from
    scikitlearn
    :return:
    """
    yaml_file = load_yaml_file(yaml_file)
    #setup normalizer
    if nrm is None:
        nrm = preprocessing.StandardScaler()
        all_data = {}
        for prt in yaml_file["protein_list"]:
            with enter_protein_data_dir(yaml_file, prt):
                print(prt)
                flist = glob.glob("./%s/*.jl" % (yaml_file["feature_dir"]))[::stride]
                for f in flist:
                    all_data[f] = verboseload(f)

        seq = []
        for i in all_data.keys():
            seq.extend(all_data[i])

        #fit it
        nrm.fit(seq)
        #dump it into the mdl dir.
        verbosedump(nrm, "%s/nrm.h5" % yaml_file["mdl_dir"])

    for prt in yaml_file["protein_list"]:
        _check_output_folder_exists(yaml_file, prt, output_folder)

        with enter_protein_data_dir(yaml_file, prt):
            output_folder_path = os.path.abspath(output_folder)
            flist = glob.glob("./%s/*.jl" % (yaml_file["feature_dir"]))
            for f in flist:
                res = verboseload(f)
                res = nrm.transform(res)
                verbosedump(res, "%s/%s" % (output_folder_path, os.path.basename(f)))

    return
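
A usage sketch (placeholder yaml path):

#fit a StandardScaler on every 40th feature file, then write normalized
#copies of all feature files into normalized_features/
normalize_project_series("project.yaml", output_folder="normalized_features",
                         stride=40)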
Example #25
def normalize_project_series(yaml_file, output_folder="normalized_features",
                             stride=1,nrm=None):
    """
    routine to take a set of proteins features stored in the feature_dir and
    normalize them by removing the mean and setting variance to 1 using the standard
    scaler. The normalizer is dumped into the mdl dir.
    :param yaml_file: The yaml file to work with.
    :param output_folder: The name of the output folder to dump normalized features in
    :param stride: The initial stride in files to fit the normalizer with.
    This is necessary to prevent memory errors. defaults to every 40th file
    :param nrm: previously fit normalizer. else it uses the standard scaler from
    scikitlearn
    :return:
    """
    yaml_file = load_yaml_file(yaml_file)
    #setup normalizer
    if nrm is None:
        nrm = preprocessing.RobustScaler()
        all_data = {}
        for prt in yaml_file["protein_list"]:
            with enter_protein_data_dir(yaml_file, prt):
                print(prt)
                flist = glob.glob("./%s/*.jl" % (yaml_file["feature_dir"]))[::stride]
                for f in flist:
                    all_data[f] = verboseload(f)

        seq = []
        for i in all_data.keys():
            seq.extend(all_data[i])

        #fit it
        nrm.fit(seq)
        #dump it into the mdl dir.
        verbosedump(nrm, "%s/nrm.h5" % yaml_file["mdl_dir"])

    for prt in yaml_file["protein_list"]:
        _check_output_folder_exists(yaml_file, prt, output_folder)

        with enter_protein_data_dir(yaml_file, prt):
            output_folder_path = os.path.abspath(output_folder)
            flist = glob.glob("./%s/*.jl" % (yaml_file["feature_dir"]))
            for f in flist:
                res = verboseload(f)
                res = nrm.transform(res)
                verbosedump(res, "%s/%s" % (output_folder_path, os.path.basename(f)))

    return
Example #26
def test_map_residue_seq_with_insert_at_end():
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        expected = {}

        t = load_random_traj(yaml_file, protein)
        #add an insertion at the END of the sequence; the residue
        #indices should be unaffected
        expected[protein] = [i for i in range(t.n_residues) if t.top.residue(i).code is not None]

        aligned_dict[protein] = t.top.to_fasta(chain=0) + "---"

        aligned_seq = aligned_dict[protein]
        actual, _ = _map_residue_ind_seq_ind(yaml_file, protein, aligned_seq)

        assert expected[protein] == list(actual.values())
    return
Example #29
def featurize_series(yaml_file, ip_view, protein_list=None):
    """
    :param yaml_file: The yaml file to work with
    :param ip_view: ipython view (required)
    :param protein_list: List of proteins. If None, all
    the proteins in yaml_file["protein_list"] are processed
    :return: None. Each protein's trajectories are featurized via
    featurize_project_wrapper and the features are written to its
    feature_dir.
    """

    yaml_file = load_yaml_file(yaml_file)

    if protein_list is None:
        protein_list = yaml_file["protein_list"]

    for protein in protein_list:
        featurize_project_wrapper(yaml_file, protein, None, 1, ip_view)
    return
Example #30
def subsample_protein(yaml_file, protein, stride=5, out_dir="sub_protein_traj"):
    yaml_file = load_yaml_file(yaml_file)

    p = Pool(int(cpu_count() / 2))

    with enter_protein_data_dir(yaml_file, protein):
        flist = [os.path.abspath(i) for i in
                 glob.glob("%s/*.hdf5" % yaml_file["protein_dir"])]

    base_dir = yaml_file["base_dir"]
    new_output_dir = os.path.join(base_dir, protein, out_dir)
    if not os.path.isdir(new_output_dir):
        os.mkdir(new_output_dir)
    fout = [os.path.join(new_output_dir, os.path.basename(i)) for i in flist]

    jobs = list(zip(flist, fout, itertools.repeat(stride)))
    p.map(subsample_traj, jobs)
    return
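
subsample_traj itself is not shown in these examples; a minimal sketch of the mapped worker, assuming mdtraj-style loading and slicing (the real helper may differ):

import mdtraj as mdt

def subsample_traj(job_tuple):
    #hypothetical worker: load the trajectory, keep every `stride`-th
    #frame, and save the result to the output path
    in_file, out_file, stride = job_tuple
    trj = mdt.load(in_file)
    trj[::stride].save_hdf5(out_file)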
Example #32
def series_feature_slicer(yaml_file, dict_feat_ind=None,
                          featurizer=None,
                          folder_name="sliced_feature_dir",
                          view=None):
    """
    :param yaml_file: The project yaml file
    :param dict_feat_ind: Dict of wanted feature indices for each protein.
    Defaults to None when you want the code to figure out which features to keep.
    :param featurizer: The featurizer object that was used to generate the features.
    :param folder_name: Name of the output folder. Defaults to sliced_feature_dir
    :param view: Pool of workers. Defaults to multiprocessing
    :return: None
    """

    yaml_file = load_yaml_file(yaml_file)

    if view is None:
        view = Pool()

    #if we want to find common features but can't get at the sequences
    if dict_feat_ind is None and ("alignment_file" not in yaml_file
                                  or featurizer is None
                                  or (not hasattr(featurizer, "describe_features"))):
        raise ValueError("To find common features, we need both "
                         "the alignment file in the yaml file "
                         "AND a featurizer obj that supports describe_features")


    if dict_feat_ind is None:
        #load alignment file
        aligned_dict = _parse_alignment_file(yaml_file["alignment_file"])
        #get list of common residue indices
        #dict_common_res, prt_mapping = _get_common_residues(yaml_file, aligned_dict)
        #get list of feature indices
        dict_feat_ind, df_dict = _get_common_features(yaml_file, featurizer, aligned_dict)

    _feature_slicer(yaml_file, dict_feat_ind, folder_name, view)

    return
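
A usage sketch (placeholder path); the alignment file is taken from the yaml file when dict_feat_ind is None:

#let the code work out the common features across the series using the
#same dihedral featurizer that generated the features
feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
series_feature_slicer("project.yaml", dict_feat_ind=None, featurizer=feat,
                      folder_name="sliced_feature_dir")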
Example #34
def test_map_residue_seq_with_two_inserts():
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        expected = {}

        t = load_random_traj(yaml_file, protein)
        #add an insertion AFTER 10 residues, and then again at 20
        expected[protein] = [i for i in range(10) if t.top.residue(i).code is not None] + \
                            [i + 3 for i in range(10, 20) if t.top.residue(i).code is not None] + \
                            [i + 5 for i in range(20, t.n_residues) if t.top.residue(i).code is not None]

        prt_code = t.top.to_fasta(chain=0)
        aligned_dict[protein] = prt_code[:10] + \
                                "---" + \
                                prt_code[10:20] + \
                                "--" + \
                                prt_code[20:]

        aligned_seq = aligned_dict[protein]
        actual, _ = _map_residue_ind_seq_ind(yaml_file, protein, aligned_seq)
        assert expected[protein] == list(actual.values())

    return
Example #35
def create_equivalent_contact_featurizer(yaml_file, alignment_file,
                                         protein_list=None,
                                         pairs=None,
                                         same_residue=True,
                                         transform=None,
                                         **kwargs):
    """
    Create a equivalent contacts featurizer for a set of proteins
    :param yaml_file: yaml file location
    :param alignment_file: alignment file location
    :param pairs: wanted sequence index positions in the alignment
    You need to just figure out the wanted location for one residue.
    _map_residue_ind_seq_ind function can help with this
    :same residue: True is you would restrict to having the same residue at the same
    sequence position.
    :param kwargs: kwargs for the contact featurizer
    :return: dictionary of contact featurizers. one for each protein
    """
    featurizer_dict = {}

    #load alignment file
    yaml_file = load_yaml_file(yaml_file)
    alignment_file = _parse_alignment_file(alignment_file)
    if protein_list is None:
        protein_list = yaml_file["protein_list"]

    if pairs is None:
        #use the max length(probably a horrible idea)
        max_seq_len = max([len(alignment_file[i]) for i in alignment_file.keys()])
        pairs = [i for i in itertools.combinations(range(max_seq_len), 2)]

    for protein in protein_list:
        print(protein)
        #get a list of residues we can keep
        can_keep = []
        #get mapping and seq
        prt_mapping, prt_seq = _map_residue_ind_seq_ind(yaml_file, protein,
                                                        alignment_file[protein])
        #invert the mapping: alignment position -> residue index
        inv_map = {v: k for k, v in prt_mapping.items()}

        for position in np.unique(pairs):
            #get the possible codes at every position
            possible_codes = set([alignment_file[p][position] for p in alignment_file.keys()])
            #keep the position only if no protein has a gap there
            if "-" not in possible_codes:
                if same_residue and len(possible_codes) != 1:
                    continue
                #get the inverse mapping and add it to the list of can keep
                residue_index = inv_map[position]
                can_keep.append(residue_index)
        #sort to keep the ordering deterministic
        can_keep = np.sort(can_keep)
        #get its pairs
        actual_pairs = np.array([i for i in itertools.combinations(can_keep, 2) if i in pairs])
        if transform == 'logistic':
            featurizer_dict[protein] = LogisticContactFeaturizer(contacts=actual_pairs, **kwargs)
        elif transform == 'binary':
            featurizer_dict[protein] = BinaryContactFeaturizer(contacts=actual_pairs, **kwargs)
        elif transform is None or transform == "none":
            featurizer_dict[protein] = ContactFeaturizer(contacts=actual_pairs, **kwargs)
        else:
            raise ValueError("transform needs to be one of logistic, binary, none")
    return featurizer_dict
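
A hedged usage sketch (all paths, names, and alignment positions are placeholders):

import mdtraj as mdt

#build per-protein contact featurizers restricted to a few alignment
#positions, then featurize one trajectory
feat_dict = create_equivalent_contact_featurizer("project.yaml",
                                                 "alignment.fasta",
                                                 pairs=[(10, 45), (10, 80)],
                                                 same_residue=True)
trj = mdt.load("trj0.hdf5")  # placeholder trajectory
features = feat_dict["kinase_1"].partial_transform(trj)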
Example #37
def test_convert_project():

    print(base_dir)
    pool = Pool(6)
    yaml_file = load_yaml_file(
        os.path.join(base_dir, "mdl_dir", "project.yaml"))

    def test_hdf5(protein, p, r, clone):
        trj, stripped_trj = _load_project_clone(protein, p, r, clone)
        trj2 = mdt.load(
            os.path.join(base_dir, protein,
                         "trajectories/%s_%d_0.hdf5" % (p, r)))
        trj3 = mdt.load(
            os.path.join(base_dir, protein,
                         "protein_traj/%s_%d_0.hdf5" % (p, r)))

        for i in ["top", "n_atoms", "n_chains", "n_frames", "n_residues"]:
            assert getattr(trj, i) == getattr(trj2, i)

        for i in ["top", "n_atoms", "n_chains", "n_frames", "n_residues"]:
            assert getattr(stripped_trj, i) == getattr(trj3, i)

        assert (trj.xyz == trj2.xyz).all()
        assert (stripped_trj.xyz == trj3.xyz).all()

        return True

    def test_stripped_hdf5(protein, p, r, clone):
        trj, stripped_trj = _load_project_clone(protein, p, r, clone)
        trj3 = mdt.load(
            os.path.join(base_dir, protein,
                         "protein_traj/%s_%d_0.hdf5" % (p, r)))

        for i in ["top", "n_atoms", "n_chains", "n_frames", "n_residues"]:
            assert getattr(stripped_trj, i) == getattr(trj3, i)

        assert (stripped_trj.xyz == trj3.xyz).all()

        return True

    def test_hdf5_file_validation():
        """
        Kinase1/RUN1/CLONE0 has a missing file results-001.tar.bz2. We
        make sure that the hdf5 has the first results-000.tar.bz2
        but not 002. This is a hardcoded test that is not really desirable.
        """
        trj = HDF5TrajectoryFile(
            os.path.join(base_dir, "kinase_1", "trajectories",
                         "fake_proj1_1_0.hdf5"))
        flist = trj._handle.root.processed_filenames
        fpath, fname = os.path.split(flist[0])

        return os.path.join(fpath, six.b("results-000.tar.bz2")) in flist and \
               os.path.join(fpath, six.b("results-002.tar.bz2")) not in flist

    def test_non_contiguous():
        """
        Kinase2/fake_proj3/RUN1/ has two clones, Clone 0 and Clone 2.
        We make sure that the naming convention is correct.
        """
        assert os.path.isfile(
            os.path.join(base_dir, "kinase_2", "protein_traj",
                         "fake_proj3_1_0.hdf5"))

        assert not os.path.isfile(
            os.path.join(base_dir, "kinase_2", "protein_traj",
                         "fake_proj3_1_1.hdf5"))

        assert os.path.isfile(
            os.path.join(base_dir, "kinase_2", "protein_traj",
                         "fake_proj3_1_2.hdf5"))
        return True

    for i in range(3):
        #extract the project multiple times to see what happens
        extract_project_wrapper(yaml_file, "kinase_1", "fake_proj1", pool)
        extract_project_wrapper(yaml_file, "kinase_1", "fake_proj2", pool)

        assert test_hdf5("kinase_1", "fake_proj1", 0, 0)

        assert test_hdf5_file_validation()

        assert test_hdf5("kinase_1", "fake_proj2", 0, 0)

        #do it for the second project too.
        extract_project_wrapper(yaml_file,
                                "kinase_2",
                                "fake_proj3",
                                pool,
                                protein_only=True)
        assert test_stripped_hdf5("kinase_2", "fake_proj3", 0, 0)
        assert test_non_contiguous()

    return True