def test_get_common_features():
    """With every protein aligned to its own ungapped sequence, all dihedral features are kept.

    The number of common features reported for a protein must equal the
    feature width produced by transforming one of its trajectories.
    """
    project = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    protein_names = project["protein_list"]

    # Use each protein's own chain-0 FASTA string as its "alignment".
    aligned_dict = {
        name: load_random_traj(project, name).top.to_fasta(chain=0)
        for name in protein_names
    }

    f = DihedralFeaturizer()
    common_feature_dic, _ = _get_common_features(project, f, aligned_dict, False)

    for name in protein_names:
        traj = load_random_traj(project, name)
        n_expected = f.transform(traj)[0].shape[1]
        assert len(common_feature_dic[name]) == n_expected
def test_get_common_features_2():
    """Every protein must end up with the same number of common phi/psi/chi1 features."""
    project = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    protein_names = project["protein_list"]

    # Use each protein's own chain-0 FASTA string as its "alignment".
    aligned_dict = {
        name: load_random_traj(project, name).top.to_fasta(chain=0)
        for name in protein_names
    }

    f = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    common_feature_dic, _ = _get_common_features(project, f, aligned_dict, False)

    # A single distinct length means all proteins agree on the feature count.
    distinct_lengths = {len(common_feature_dic[name]) for name in protein_names}
    assert len(distinct_lengths) == 1
def test_present_for_all_same_seq():
    """`_present_for_all` should return one entry per residue of the protein sequence here."""
    project = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    protein_names = project["protein_list"]

    # Use each protein's own chain-0 FASTA string as its "alignment".
    aligned_dict = {
        name: load_random_traj(project, name).top.to_fasta(chain=0)
        for name in protein_names
    }

    for name in protein_names:
        mapping, sequence = _map_residue_ind_seq_ind(project, name, aligned_dict[name])
        shared = _present_for_all(name, mapping, sequence, aligned_dict)
        assert len(shared) == len(sequence)
def test_get_common_residues():
    """Each protein's common-residue list must be as long as its own chain-0 sequence.

    One trajectory is kept per protein so the check loop compares every
    protein against its own topology rather than against whichever
    trajectory happened to be loaded last.
    """
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    trajectories = {}
    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        trajectories[protein] = load_random_traj(yaml_file, protein)
        aligned_dict[protein] = trajectories[protein].top.to_fasta(chain=0)

    res_dic, _ = _get_common_residues(yaml_file, aligned_dict)

    for protein in yaml_file["protein_list"]:
        t = trajectories[protein]
        print(len(res_dic[protein]), t.n_residues)
        # aligned_dict[protein] is exactly t.top.to_fasta(chain=0) for this protein.
        assert len(res_dic[protein]) == len(aligned_dict[protein])
def test_map_residue_seq_with_insert():
    """Three leading gap columns must shift every mapped alignment index by three."""
    project = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    for name in project["protein_list"]:
        traj = load_random_traj(project, name)
        # Every protein residue is expected to land three columns to the right.
        shifted = [res.index + 3 for res in traj.top.residues if res.is_protein]
        gapped_seq = "---" + traj.top.to_fasta(chain=0)
        mapping, _ = _map_residue_ind_seq_ind(project, name, gapped_seq)
        assert shifted == list(mapping.values())
def test_map_residue_seq_with_insert_at_end():
    """Trailing gap columns must leave the residue-to-alignment mapping unshifted."""
    project = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    for name in project["protein_list"]:
        traj = load_random_traj(project, name)
        top = traj.top
        # The gaps are appended after the last residue, so every residue that
        # has a one-letter code is expected to map onto its own index.
        unshifted = [
            idx for idx in range(traj.n_residues)
            if top.residue(idx).code is not None
        ]
        gapped_seq = top.to_fasta(chain=0) + "---"
        mapping, _ = _map_residue_ind_seq_ind(project, name, gapped_seq)
        assert unshifted == list(mapping.values())
def _map_residue_ind_seq_ind(yaml_file, protein, aligned_seq): trj = load_random_traj(yaml_file, protein) mapping = {} seq_index = 0 prt_seq = trj.top.to_fasta(chain=0) #test to make sure the alignment sequence matches with the protein sequence. #get rid of _ from the alignment to account for additions/deletions. assert(prt_seq==''.join([i for i in aligned_seq if i!="-"])) for i in [i.index for i in trj.top.residues if i.is_protein]: while True: if trj.top.residue(i).code == aligned_seq[seq_index]: mapping[i] = seq_index seq_index += 1 break else: seq_index += 1 continue return mapping, prt_seq
def _map_residue_ind_seq_ind(yaml_file, protein, aligned_seq, trj=None): if trj is None: trj = load_random_traj(yaml_file, protein) mapping = {} seq_index = 0 prt_seq = ''.join([i.code for i in trj.top.residues if i.is_protein]) #test to make sure the alignment sequence matches with the protein sequence. #get rid of _ from the alignment to account for additions/deletions. assert(prt_seq==''.join([i for i in aligned_seq if i!="-"])) for i in [i.index for i in trj.top.residues if i.is_protein]: while True: if trj.top.residue(i).code == aligned_seq[seq_index]: mapping[i] = seq_index seq_index += 1 break else: seq_index += 1 continue return mapping, prt_seq
def test_map_residue_seq_with_two_inserts():
    """Two internal gap runs must shift the mapping by 3 and then by 5 columns in total."""
    project = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    for name in project["protein_list"]:
        traj = load_random_traj(project, name)
        top = traj.top

        # (first residue, one-past-last residue, expected column shift):
        # a 3-wide gap is inserted after residue 10 and a 2-wide gap after 20.
        segments = (
            (0, 10, 0),
            (10, 20, 3),
            (20, traj.n_residues, 5),
        )
        wanted = [
            idx + shift
            for start, stop, shift in segments
            for idx in range(start, stop)
            if top.residue(idx).code is not None
        ]

        fasta = top.to_fasta(chain=0)
        gapped_seq = "".join([fasta[:10], "---", fasta[10:20], "--", fasta[20:]])

        mapping, _ = _map_residue_ind_seq_ind(project, name, gapped_seq)
        assert wanted == list(mapping.values())
def _get_common_features(yaml_file, featurizer, aligned_dict, save_df=True):
    """
    Function to get the common features across proteins using the common
    residues. Can optionally save the pandas data to the mdl_dir and to the
    protein data dir.

    A feature is kept for a protein when every residue it involves has the
    same one-letter code at the same alignment column in every protein, and
    when the spacing between its residues is the same in alignment columns as
    in residue indices (i.e. no insertion/deletion falls between them).

    :param yaml_file: The protein yaml_file; its "protein_list" is iterated.
    :param featurizer: featurizer object used; it is cloned for each protein.
    :param aligned_dict: Dictionary of alignments for each protein
    :param save_df: When True, dump the selected feature descriptors for each
        protein as "feature_descriptor.h5" in its mdl dir and under
        "sliced_feature_dir" in its data dir.
    :return: ``(result_dict, df_dict)``: per protein, the list of selected
        feature indices and the matching rows of the feature description frame.
    """
    result_dict = {}
    df_dict = {}
    protein_list = yaml_file["protein_list"]
    for protein in protein_list:
        print(protein)
        # reset the featurizer
        featurizer = clone(featurizer)
        trj = load_random_traj(yaml_file, protein)
        df = pd.DataFrame(featurizer.describe_features(trj))
        prt_mapping, prt_seq = _map_residue_ind_seq_ind(
            yaml_file, protein, aligned_dict[protein])
        feature_vec = []
        # for every feature: its index and the feature description itself
        for feature_ind, feature_dict in df.iterrows():
            resids = list(feature_dict["resids"])
            all_res_in_algn = []
            mapped_index_list = []
            for aa_ind in resids:
                # NOTE(review): indexes prt_seq by residue index, which assumes
                # sequence positions and residue indices coincide - confirm.
                aa_code = prt_seq[aa_ind]
                # make sure we have the same residue
                assert trj.top.residue(aa_ind).code == aa_code
                # get the mapping for that aa to the main alignment
                mapped_index = prt_mapping[aa_ind]
                # for every protein in the alignment, check if we have the
                # same residue at the same position
                all_res_in_algn.append(
                    all(aligned_dict[prt][mapped_index] == aa_code
                        for prt in protein_list))
                mapped_index_list.append(mapped_index)
            # to account for additions and deletions, we check that the step
            # between consecutive alignment columns equals the step between
            # the corresponding residue indices.
            mapped_index_difference = [
                nxt - prev
                for prev, nxt in zip(mapped_index_list, mapped_index_list[1:])
            ]
            resid_index_difference = [
                nxt - prev for prev, nxt in zip(resids, resids[1:])
            ]
            if mapped_index_difference != resid_index_difference:
                all_res_in_algn.append(False)
            # builtin all() replaces np.alltrue, which NumPy 2.0 removed
            if all(all_res_in_algn):
                feature_vec.append(feature_ind)
        df_dict[protein] = df.iloc[feature_vec]
        result_dict[protein] = feature_vec
        if save_df:
            new_df = df.iloc[feature_vec]
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(new_df, os.path.join("feature_descriptor.h5"))
            with enter_protein_data_dir(yaml_file, protein):
                verbosedump(new_df, os.path.join("sliced_feature_dir", "feature_descriptor.h5"))
    return result_dict, df_dict