def test_get_common_features():
    """Check the common-feature count for a default ``DihedralFeaturizer``.

    Each protein's own chain-0 FASTA string is used as its "aligned"
    sequence. For every protein, the number of common feature indices
    returned by ``_get_common_features`` must equal the second dimension of
    the first array produced by ``featurizer.transform``.
    """
    project_path = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_path)

    # One (ungapped) sequence per protein, taken from a randomly loaded trajectory.
    aligned_dict = {
        protein: load_random_traj(yaml_file, protein).top.to_fasta(chain=0)
        for protein in yaml_file["protein_list"]
    }

    featurizer = DihedralFeaturizer()
    # save_df is passed as False so nothing is dumped while testing.
    common_feature_dic, _ = _get_common_features(
        yaml_file, featurizer, aligned_dict, False)

    for protein in yaml_file["protein_list"]:
        traj = load_random_traj(yaml_file, protein)
        n_common = len(common_feature_dic[protein])
        n_features = featurizer.transform(traj)[0].shape[1]
        assert n_common == n_features
def test_get_common_features_2():
    """Check that every protein gets the same number of common features.

    Uses a ``DihedralFeaturizer`` restricted to phi, psi and chi1 and each
    protein's own chain-0 FASTA string as its "aligned" sequence.
    """
    project_path = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_path)

    aligned_dict = {
        protein: load_random_traj(yaml_file, protein).top.to_fasta(chain=0)
        for protein in yaml_file["protein_list"]
    }

    featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    # save_df is passed as False so nothing is dumped while testing.
    common_feature_dic, _ = _get_common_features(
        yaml_file, featurizer, aligned_dict, False)

    # Collapse the per-protein feature counts; exactly one distinct value is allowed.
    distinct_counts = {
        len(common_feature_dic[protein])
        for protein in yaml_file["protein_list"]
    }
    assert len(distinct_counts) == 1
def test_present_for_all_same_seq():
    """Check ``_present_for_all`` when each protein is aligned to its own sequence.

    With each protein's ungapped chain-0 FASTA string used as its alignment,
    the result of ``_present_for_all`` must have as many entries as the
    protein sequence returned by ``_map_residue_ind_seq_ind`` has characters.
    """
    project_path = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_path)

    aligned_dict = {
        protein: load_random_traj(yaml_file, protein).top.to_fasta(chain=0)
        for protein in yaml_file["protein_list"]
    }

    for protein in yaml_file["protein_list"]:
        prt_mapping, prt_seq = _map_residue_ind_seq_ind(
            yaml_file, protein, aligned_dict[protein])
        present = _present_for_all(protein, prt_mapping, prt_seq, aligned_dict)
        assert len(present) == len(prt_seq)
def test_get_common_residues():
    """Check that ``_get_common_residues`` keeps every residue of each protein.

    Each protein's own chain-0 FASTA string is used as its "aligned"
    sequence. For every protein, the number of common residues must equal
    the length of that protein's own sequence.

    The per-protein expectations are recorded while the trajectories are
    loaded, so each protein is compared against its own trajectory rather
    than against whichever trajectory happened to be loaded last.
    """
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    aligned_dict = {}
    expected_len = {}
    n_residues = {}
    for protein in yaml_file["protein_list"]:
        t = load_random_traj(yaml_file, protein)
        fasta = t.top.to_fasta(chain=0)
        aligned_dict[protein] = fasta
        # Stored separately from aligned_dict so the expectation cannot be
        # affected by anything the function under test does with its input.
        expected_len[protein] = len(fasta)
        n_residues[protein] = t.n_residues

    res_dic, _ = _get_common_residues(yaml_file, aligned_dict)
    for protein in yaml_file["protein_list"]:
        print(len(res_dic[protein]), n_residues[protein])
        assert len(res_dic[protein]) == expected_len[protein]
def test_map_residue_seq_with_insert():
    """Check the residue-to-alignment mapping when three gaps lead the sequence.

    Prepending ``"---"`` to a protein's chain-0 FASTA string should shift the
    mapped alignment position of every protein residue by three.
    """
    project_path = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_path)

    for protein in yaml_file["protein_list"]:
        traj = load_random_traj(yaml_file, protein)

        # Every protein residue is expected three columns to the right.
        expected = [res.index + 3 for res in traj.top.residues if res.is_protein]
        aligned_seq = "---" + traj.top.to_fasta(chain=0)

        actual, _ = _map_residue_ind_seq_ind(yaml_file, protein, aligned_seq)
        assert expected == list(actual.values())
def test_map_residue_seq_with_insert_at_end():
    """Check the residue-to-alignment mapping when three gaps trail the sequence.

    Appending ``"---"`` to a protein's chain-0 FASTA string should leave the
    mapping unshifted: each residue that has a one-letter code is expected
    at the alignment column equal to its own index.
    """
    project_path = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_path)

    for protein in yaml_file["protein_list"]:
        traj = load_random_traj(yaml_file, protein)

        # Gaps come after the last residue, so no position is expected to move.
        expected = [
            idx for idx in range(traj.n_residues)
            if traj.top.residue(idx).code is not None
        ]
        aligned_seq = traj.top.to_fasta(chain=0) + "---"

        actual, _ = _map_residue_ind_seq_ind(yaml_file, protein, aligned_seq)
        assert expected == list(actual.values())
def _map_residue_ind_seq_ind(yaml_file, protein, aligned_seq):
    """Map protein residue indices to their positions in an aligned sequence.

    A trajectory for ``protein`` is loaded with ``load_random_traj`` and its
    protein residues are walked in order; each one is matched to the next
    position in ``aligned_seq`` holding the same one-letter code.

    :param yaml_file: The project yaml_file, forwarded to ``load_random_traj``
    :param protein: Name of the protein whose trajectory is loaded
    :param aligned_seq: Aligned sequence for the protein, with ``"-"`` as gap
    :return: ``(mapping, prt_seq)`` where ``mapping`` is a dict from residue
        index to position in ``aligned_seq`` and ``prt_seq`` is the chain-0
        FASTA string of the loaded topology
    :raises AssertionError: if ``aligned_seq`` with its ``"-"`` characters
        removed differs from ``prt_seq``
    """
    topology = load_random_traj(yaml_file, protein).top
    prt_seq = topology.to_fasta(chain=0)

    # The alignment, stripped of its gap characters, must spell the protein
    # sequence exactly; gaps only account for additions/deletions.
    ungapped = ''.join(ch for ch in aligned_seq if ch != "-")
    assert prt_seq == ungapped

    protein_indices = [res.index for res in topology.residues if res.is_protein]

    mapping = {}
    cursor = 0
    for res_idx in protein_indices:
        code = topology.residue(res_idx).code
        # Skip alignment columns until one carries this residue's code.
        while aligned_seq[cursor] != code:
            cursor += 1
        mapping[res_idx] = cursor
        cursor += 1

    return mapping, prt_seq
示例#8
0
def _map_residue_ind_seq_ind(yaml_file, protein, aligned_seq, trj=None):
    if trj is None:
        trj = load_random_traj(yaml_file, protein)
    mapping = {}
    seq_index = 0
    prt_seq = ''.join([i.code for i in trj.top.residues if i.is_protein])
    #test to make sure the alignment sequence matches with the protein sequence.
    #get rid of _ from the alignment to account for additions/deletions.
    assert(prt_seq==''.join([i for i in aligned_seq if i!="-"]))
    for i in [i.index for i in trj.top.residues if i.is_protein]:
        while True:
            if trj.top.residue(i).code == aligned_seq[seq_index]:
                mapping[i] = seq_index
                seq_index += 1
                break
            else:
                seq_index += 1
                continue

    return mapping, prt_seq
def test_map_residue_seq_with_two_inserts():
    """Check the residue-to-alignment mapping with gaps inserted in two places.

    Three gaps are inserted after the first 10 characters of the chain-0
    FASTA string and two more after the first 20. Residues 0-9 should map
    unshifted, residues 10-19 shifted by 3, and the rest shifted by 5.
    """
    project_path = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_path)

    for protein in yaml_file["protein_list"]:
        traj = load_random_traj(yaml_file, protein)

        # (first index, one-past-last index, expected shift) per segment.
        segments = (
            (0, 10, 0),
            (10, 20, 3),
            (20, traj.n_residues, 5),
        )
        expected = []
        for start, stop, shift in segments:
            expected.extend(
                idx + shift for idx in range(start, stop)
                if traj.top.residue(idx).code is not None
            )

        prt_code = traj.top.to_fasta(chain=0)
        aligned_seq = prt_code[:10] + "---" + prt_code[10:20] + "--" + prt_code[20:]

        actual, _ = _map_residue_ind_seq_ind(yaml_file, protein, aligned_seq)
        assert expected == list(actual.values())
示例#10
0
def _get_common_features(yaml_file, featurizer, aligned_dict, save_df=True):
    """
    Function to get the common features across proteins using the common residues.

    A feature is kept for a protein when (a) every residue it involves has the
    same one-letter code at the same alignment position in every protein's
    aligned sequence, and (b) the spacing between its consecutive residues is
    the same in residue-index space as in alignment-position space.
    Can optionally save the selected feature descriptions as pandas data.

    :param yaml_file: The protein yaml_file; ``yaml_file["protein_list"]`` is
        iterated
    :param featurizer: featurizer object used; it is cloned for each protein
    :param aligned_dict: Dictionary of aligned sequences, keyed by protein
    :param save_df: If True, dump the selected rows of the feature
        description to ``feature_descriptor.h5`` inside the protein's mdl dir
        and to ``sliced_feature_dir/feature_descriptor.h5`` inside the
        protein's data dir
    :return: ``(result_dict, df_dict)``; for each protein, ``result_dict``
        holds the list of kept feature indices and ``df_dict`` the
        corresponding rows of the feature description DataFrame
    """
    protein_list = yaml_file["protein_list"]
    result_dict = {}
    df_dict = {}
    for protein in protein_list:
        print(protein)
        # reset the featurizer by cloning it before describing this protein
        featurizer = clone(featurizer)
        trj = load_random_traj(yaml_file, protein)
        df = pd.DataFrame(featurizer.describe_features(trj))
        prt_mapping, prt_seq = _map_residue_ind_seq_ind(yaml_file, protein,
                                                        aligned_dict[protein])
        feature_vec = []
        # for every feature: its index and the description row itself
        for feature_ind, feature_dict in df.iterrows():
            resids = feature_dict["resids"]
            all_res_in_algn = []
            mapped_index_list = []
            for aa_ind in resids:
                aa_code = prt_seq[aa_ind]
                # make sure we have the same residue
                assert trj.top.residue(aa_ind).code == aa_code
                # get the mapping for that aa to the main alignment
                mapped_index = prt_mapping[aa_ind]
                # for every protein in the alignment, check if we have the
                # same residue at the same position.
                # np.all is used because np.alltrue no longer exists in NumPy 2.
                all_res_in_algn.append(
                    np.all([aligned_dict[prt][mapped_index] == aa_code
                            for prt in protein_list]))
                mapped_index_list.append(mapped_index)

            # to account for additions and deletions, we check if the
            # difference between consecutive mapped positions is the same as
            # the difference between consecutive residue indices.
            mapped_index_difference = [x - mapped_index_list[i - 1]
                                       for i, x in enumerate(mapped_index_list) if i > 0]
            resid_index_difference = [x - resids[i - 1]
                                      for i, x in enumerate(resids) if i > 0]
            if mapped_index_difference != resid_index_difference:
                all_res_in_algn.append(False)

            if np.all(all_res_in_algn):
                feature_vec.append(feature_ind)

        # slice once and reuse for both the return value and the dumps
        common_df = df.iloc[feature_vec]
        df_dict[protein] = common_df
        result_dict[protein] = feature_vec

        if save_df:
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(common_df, "feature_descriptor.h5")
            with enter_protein_data_dir(yaml_file, protein):
                verbosedump(common_df, os.path.join("sliced_feature_dir",
                                                    "feature_descriptor.h5"))
    return result_dict, df_dict
示例#11
0
def _get_common_features(yaml_file, featurizer, aligned_dict, save_df=True):
    """
    Function to get the common features across proteins using the common residues.

    A feature is kept for a protein when (a) every residue it involves has the
    same one-letter code at the same alignment position in every protein's
    aligned sequence, and (b) the spacing between its consecutive residues is
    the same in residue-index space as in alignment-position space.
    Can optionally save the selected feature descriptions as pandas data.

    :param yaml_file: The protein yaml_file; ``yaml_file["protein_list"]`` is
        iterated
    :param featurizer: featurizer object used; it is cloned for each protein
    :param aligned_dict: Dictionary of aligned sequences, keyed by protein
    :param save_df: If True, dump the selected rows of the feature
        description to ``feature_descriptor.h5`` inside the protein's mdl dir
        and to ``sliced_feature_dir/feature_descriptor.h5`` inside the
        protein's data dir
    :return: ``(result_dict, df_dict)``; for each protein, ``result_dict``
        holds the list of kept feature indices and ``df_dict`` the
        corresponding rows of the feature description DataFrame
    """
    protein_list = yaml_file["protein_list"]
    result_dict = {}
    df_dict = {}
    for protein in protein_list:
        print(protein)
        # reset the featurizer by cloning it before describing this protein
        featurizer = clone(featurizer)
        trj = load_random_traj(yaml_file, protein)
        df = pd.DataFrame(featurizer.describe_features(trj))
        prt_mapping, prt_seq = _map_residue_ind_seq_ind(yaml_file, protein,
                                                        aligned_dict[protein])
        feature_vec = []
        # for every feature: its index and the description row itself
        for feature_ind, feature_dict in df.iterrows():
            resids = feature_dict["resids"]
            all_res_in_algn = []
            mapped_index_list = []
            for aa_ind in resids:
                aa_code = prt_seq[aa_ind]
                # make sure we have the same residue
                assert trj.top.residue(aa_ind).code == aa_code
                # get the mapping for that aa to the main alignment
                mapped_index = prt_mapping[aa_ind]
                # for every protein in the alignment, check if we have the
                # same residue at the same position.
                # np.all is used because np.alltrue no longer exists in NumPy 2.
                all_res_in_algn.append(
                    np.all([aligned_dict[prt][mapped_index] == aa_code
                            for prt in protein_list]))
                mapped_index_list.append(mapped_index)

            # to account for additions and deletions, we check if the
            # difference between consecutive mapped positions is the same as
            # the difference between consecutive residue indices.
            mapped_index_difference = [x - mapped_index_list[i - 1]
                                       for i, x in enumerate(mapped_index_list) if i > 0]
            resid_index_difference = [x - resids[i - 1]
                                      for i, x in enumerate(resids) if i > 0]
            if mapped_index_difference != resid_index_difference:
                all_res_in_algn.append(False)

            if np.all(all_res_in_algn):
                feature_vec.append(feature_ind)

        # slice once and reuse for both the return value and the dumps
        common_df = df.iloc[feature_vec]
        df_dict[protein] = common_df
        result_dict[protein] = feature_vec

        if save_df:
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(common_df, "feature_descriptor.h5")
            with enter_protein_data_dir(yaml_file, protein):
                verbosedump(common_df, os.path.join("sliced_feature_dir",
                                                    "feature_descriptor.h5"))
    return result_dict, df_dict