def _extract_data(self, traj):
    """Build a per-residue table of contact distances for one trajectory.

    A ``ContactFeaturizer`` configured from ``self.contacts``,
    ``self.scheme`` and ``self.ignore_nonprotein`` computes the contact
    features of ``traj``.  For every residue id that appears in any
    described pair, the feature columns involving that residue are
    collected into a DataFrame whose two-level column index is
    ``(partner residue, residue)``.

    Returns the column-wise concatenation of those DataFrames.
    """
    featurizer = ContactFeaturizer(
        contacts=self.contacts,
        scheme=self.scheme,
        ignore_nonprotein=self.ignore_nonprotein,
    )
    distances = featurizer.partial_transform(traj)
    pairs = [entry["resids"] for entry in featurizer.describe_features(traj)]

    frames = []
    for resid in np.unique(pairs):
        partners = []
        involved = []
        for pair in pairs:
            hit = resid in pair
            involved.append(hit)
            if hit:
                # the other member of the pair
                partners.append(list(set(pair) - {resid})[0])
        frames.append(
            pd.DataFrame(
                distances[:, np.array(involved)],
                columns=[partners, len(partners) * [resid]],
            )
        )
    return pd.concat(frames, axis=1)
def test_ContactFeaturizer_describe_features():
    """Check that described residue pairs reproduce the featurized columns.

    A random scheme and a random trajectory are chosen; 25 randomly drawn
    feature columns are then recomputed with ``md.compute_contacts`` from
    the ``resids`` reported by ``describe_features`` and compared exactly.
    """
    scheme = np.random.choice(['ca', 'closest', 'closest-heavy'])
    featurizer = ContactFeaturizer(scheme=scheme, ignore_nonprotein=True)
    traj_index = np.random.randint(len(trajectories))
    features = featurizer.transform([trajectories[traj_index]])
    description = pd.DataFrame(
        featurizer.describe_features(trajectories[traj_index]))

    for _ in range(25):
        column = np.random.choice(len(description))
        resid_pair = description.iloc[column].resids
        recomputed, _pairs = md.compute_contacts(
            trajectories[traj_index], contacts=[resid_pair], scheme=scheme)
        matches = features[0][:, column] == recomputed.flatten()
        assert matches.all()
def Get_contacts_features_villin():
    """Featurize the villin trajectories with CA contacts and store them.

    Changes the working directory to the villin trajectory folder, removes
    any existing ``./contacts`` directory, prints the residue pairs the
    featurizer describes for ``trajectory-331.xtc``, and writes the
    transformed dataset to ``contacts/`` in ``dir-npy`` format.

    Returns the dataset produced by ``fit_transform_with``.
    """
    import os
    import shutil
    import mdtraj as md

    os.chdir('/homes/anuginueni/traj_villin')
    stale_output = './contacts'
    if os.path.isdir(stale_output):
        shutil.rmtree(stale_output)

    from msmbuilder.dataset import dataset
    topology_file = '/homes/anuginueni/traj_villin/filtered.pdb'
    trajectories = dataset(
        "/homes/anuginueni/traj_villin/*.xtc",
        topology=topology_file,
        stride=5)
    reference = md.load(
        "/homes/anuginueni/traj_villin/trajectory-331.xtc",
        top=topology_file,
        stride=5)

    from msmbuilder.featurizer import ContactFeaturizer
    ca_featurizer = ContactFeaturizer(scheme='ca')

    # Show which residue pair each feature column corresponds to.
    resid_pairs = []
    for feature in ca_featurizer.describe_features(reference):
        resid_pairs.append(feature['resids'])
    print(str(resid_pairs))

    return trajectories.fit_transform_with(
        ca_featurizer, 'contacts/', fmt='dir-npy')
def _extract_data(self, traj):
    """Return contact distances of ``traj`` grouped by residue.

    Contacts are computed with a ``ContactFeaturizer`` built from
    ``self.contacts``, ``self.scheme`` and ``self.ignore_nonprotein``.
    Each residue id found in the described pairs contributes one
    DataFrame holding the distance columns of the pairs it takes part in,
    labelled with a two-level column index ``(partner residue, residue)``;
    the DataFrames are concatenated side by side.
    """
    contact = ContactFeaturizer(contacts=self.contacts,
                                scheme=self.scheme,
                                ignore_nonprotein=self.ignore_nonprotein)
    distances = contact.partial_transform(traj)
    summary = contact.describe_features(traj)
    pairs = [item['resids'] for item in summary]

    def _table_for(resid):
        # Boolean column selector: which pairs contain this residue.
        selector = np.array([resid in pair for pair in pairs])
        # For each selected pair, the residue on the other side.
        others = [list(set(pair) - {resid})[0]
                  for pair in pairs if resid in pair]
        return pd.DataFrame(distances[:, selector],
                            columns=[others, len(others) * [resid]])

    data = [_table_for(resid) for resid in np.unique(pairs)]
    return pd.concat(data, axis=1)
def test_distance_to_logistic():
    """Logistic contact features must fall on the correct side of 0.5.

    For ten randomly drawn (frame, pair) entries, a distance above
    ``center`` must map to a logistic value below 0.5 and a distance below
    ``center`` to a value above 0.5.
    """
    trajectories = MinimalFsPeptide().get_cached().trajectories
    steepness = np.absolute(10 * np.random.randn())
    center = np.absolute(np.random.randn())

    raw = ContactFeaturizer().transform(trajectories)
    squashed = LogisticContactFeaturizer(
        center=center, steepness=steepness).transform(trajectories)

    n_frames, n_pairs = raw[0].shape
    for _ in range(10):
        # randint's upper bound is exclusive, so the last frame and the
        # last pair are never drawn.
        frame = np.random.randint(0, n_frames - 1)
        pair = np.random.randint(0, n_pairs - 1)
        distance = raw[0][frame][pair]
        response = squashed[0][frame][pair]
        if distance > center:
            assert response < 0.5
        if distance < center:
            assert response > 0.5
def test_soft_min_contact_featurizer():
    """Soft-min contact feature must equal the soft-min of atom distances.

    Two distinct residues are drawn at random; the featurizer output for
    that single contact is compared against the soft-min formula applied
    to all inter-residue atom-pair distances from ``md.compute_distances``.
    """
    # just get one frame for now
    traj = MinimalFsPeptide().get_cached().trajectories[0][0]
    soft_min_beta = 20

    ri, rj = np.random.choice(
        np.arange(traj.top.n_residues), size=2, replace=False)
    aind_i = [atom.index for atom in traj.top.residue(ri).atoms]
    aind_j = [atom.index for atom in traj.top.residue(rj).atoms]
    atom_pairs = list(itertools.product(aind_i, aind_j))

    featurizer = ContactFeaturizer(contacts=[[ri, rj]], scheme='closest',
                                   soft_min=True,
                                   soft_min_beta=soft_min_beta)
    features = featurizer.transform([traj])[0]

    distances = md.compute_distances(traj, atom_pairs)
    expected = soft_min_beta / \
        np.log(np.sum(np.exp(soft_min_beta / distances), axis=1))

    # The comparison result used to be discarded, so the test could never
    # fail; assert it.
    assert np.allclose(features, expected)
def test_distance_to_logistic():
    """Spot-check the logistic transform against the raw contact distances.

    Ten random entries of the first trajectory are sampled; entries whose
    distance exceeds ``center`` must have a logistic value under 0.5, and
    entries under ``center`` a value over 0.5.
    """
    trajectories = MinimalFsPeptide().get_cached().trajectories
    steepness = np.absolute(10 * np.random.randn())
    center = np.absolute(np.random.randn())

    plain_featurizer = ContactFeaturizer()
    contacts = plain_featurizer.transform(trajectories)
    logistic_featurizer = LogisticContactFeaturizer(center=center,
                                                    steepness=steepness)
    logistics = logistic_featurizer.transform(trajectories)

    first_contacts = contacts[0]
    first_logistics = logistics[0]
    for _ in range(10):
        row = np.random.randint(0, first_contacts.shape[0] - 1)
        col = np.random.randint(0, first_contacts.shape[1] - 1)
        distance = first_contacts[row][col]
        value = first_logistics[row][col]
        if distance > center:
            assert value < 0.5
        elif distance < center:
            assert value > 0.5
def main():
    """Count, per trajectory frame, how many native CA-CA contacts are formed.

    Command-line arguments:
      pdbpath -- path handed to msmbuilder's ``dataset`` to load trajectories
      target  -- pdb file of the target structure

    A CA-CA pair of the target counts as a native contact when its
    distance is <= 0.75 (the unit is whatever ``md.compute_contacts``
    returns).  For every frame of every trajectory the number of native
    pairs within the same cutoff is counted.  One line per frame,
    ``<n_native> <n_formed> <fraction>``, is written to
    ``<target basename>.<pdbpath basename>.number_native_contact.dat``.

    Relies on module-level ``np`` and ``get_basename_no_ext``.
    """
    import argparse
    import textwrap

    # Same cutoff is used to define native contacts and to test frames.
    contact_cutoff = 0.75

    parser = argparse.ArgumentParser(
        usage=textwrap.dedent(
            '''Use "python %(prog)s -h" for more information.'''),
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        'pdbpath',
        help=textwrap.dedent('''[required] Path to pdb trajectories.'''))
    parser.add_argument(
        'target',
        help=textwrap.dedent(
            '[required] Path to target pdb. Note: The target pdb should '
            'have the same number of atoms in structure with that in pdb '
            'trajectories. '))
    args = parser.parse_args()

    from msmbuilder.dataset import dataset
    coords = dataset(args.pdbpath)
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print('%i trajectories found. ' % len(coords))

    import mdtraj as md
    target = md.load(args.target)
    native_contact_dists, native_contact_pairs = md.compute_contacts(
        target, scheme='ca')
    # The target is a single structure: use its first (only) frame.
    native_contact_pairs = native_contact_pairs[
        native_contact_dists[0] <= contact_cutoff]
    n_native_contact = len(native_contact_pairs)
    print("Target structure has %i pairs of CA-CA contact in total. \n"
          % n_native_contact)

    from msmbuilder.featurizer import ContactFeaturizer
    frame_dists = np.concatenate(
        ContactFeaturizer(contacts=native_contact_pairs,
                          scheme='ca').fit_transform(coords)
    )  # (n_samples, n_pairs)
    native_contact_to_target = np.sum(frame_dists <= contact_cutoff, axis=1)

    out_name = '%s.%s.number_native_contact.dat' % (
        get_basename_no_ext(args.target), get_basename_no_ext(args.pdbpath))
    with open(out_name, 'w') as f:
        for e in native_contact_to_target:
            # "* 1." keeps the ratio a float under Python 2 as well.
            f.write('%i %i %.3f\n' % (n_native_contact, e,
                                      e * 1. / n_native_contact))
def test_FeatureSelector_describe_features():
    """FeatureSelector descriptions must be the concatenation of its parts.

    Describes one random trajectory with a contact featurizer, a dihedral
    featurizer and a ``FeatureSelector`` combining both, then checks the
    combined description has the summed length and that 40 randomly drawn
    rows agree, column by column, with the per-featurizer descriptions
    concatenated in ``feat_list`` order.
    """
    picked = np.random.randint(len(trajectories))
    traj = trajectories[picked]

    contact_feat = ContactFeaturizer(scheme='CA', ignore_nonprotein=True)
    contact_feat.transform([traj])
    contact_df = pd.DataFrame(contact_feat.describe_features(traj))

    dihedral_feat = DihedralFeaturizer()
    dihedral_feat.transform([traj])
    dihedral_df = pd.DataFrame(dihedral_feat.describe_features(traj))

    frames_by_name = {"ca": contact_df, "dih": dihedral_df}

    selector = FeatureSelector([('ca', contact_feat),
                                ('dih', dihedral_feat)])
    selector.transform([traj])
    combined_df = pd.DataFrame(selector.describe_features(traj))
    assert len(combined_df) == len(contact_df) + len(dihedral_df)

    expected_df = pd.concat(
        [frames_by_name[name] for name in selector.feat_list])

    # lets randomly compare 40 features
    sampled_rows = np.random.choice(range(len(combined_df)), 40)
    for row in sampled_rows:
        for column in combined_df.columns:
            assert eq(combined_df.iloc[row][column],
                      expected_df.iloc[row][column])
def featurize_trajectories(coords, featurizer):
    """Featurize trajectories with the msmbuilder featurizer named by a string.

    Parameters
    ----------
    coords
        Trajectory collection passed to the featurizer's ``fit_transform``;
        ``coords[0]`` is used as the reference for ``'RMSDFeaturizer'``.
    featurizer : str
        One of ``'RMSDFeaturizer'``, ``'DRIDFeaturizer'``,
        ``'ContactFeaturizer'`` (CA scheme) or ``'DihedralFeaturizer'``
        (phi and psi only).

    Returns
    -------
    Whatever the selected featurizer's ``fit_transform(coords)`` returns.

    Raises
    ------
    ValueError
        If ``featurizer`` is not one of the supported names (including
        ``None``).
    """
    # msmbuilder classes are imported lazily so only the requested
    # featurizer is loaded.
    if featurizer == 'RMSDFeaturizer':
        from msmbuilder.featurizer import RMSDFeaturizer
        feat = RMSDFeaturizer(reference_traj=coords[0])
    elif featurizer == 'DRIDFeaturizer':
        from msmbuilder.featurizer import DRIDFeaturizer
        feat = DRIDFeaturizer()
    elif featurizer == 'ContactFeaturizer':
        from msmbuilder.featurizer import ContactFeaturizer
        feat = ContactFeaturizer(scheme='ca')
    elif featurizer == 'DihedralFeaturizer':
        from msmbuilder.featurizer import DihedralFeaturizer
        feat = DihedralFeaturizer(types=['phi', 'psi'])
    else:
        # Without this branch an unknown name surfaced as an
        # UnboundLocalError on ``feat`` below.
        raise ValueError(
            "Unknown featurizer %r; choose one of RMSDFeaturizer, "
            "DRIDFeaturizer, ContactFeaturizer, DihedralFeaturizer"
            % (featurizer,))
    return feat.fit_transform(coords)
def test_soft_min_contact_featurizer():
    """Compare the soft-min contact feature with a direct computation.

    For one frame and one randomly chosen pair of distinct residues, the
    ``ContactFeaturizer`` output (``scheme='closest'``, ``soft_min=True``)
    must match the soft-min formula evaluated over every atom pair
    between the two residues.
    """
    # just get one frame for now
    traj = MinimalFsPeptide().get_cached().trajectories[0][0]
    soft_min_beta = 20

    residue_ids = np.arange(traj.top.n_residues)
    ri, rj = np.random.choice(residue_ids, size=2, replace=False)
    aind_i = [atom.index for atom in traj.top.residue(ri).atoms]
    aind_j = [atom.index for atom in traj.top.residue(rj).atoms]
    atom_pairs = list(itertools.product(aind_i, aind_j))

    featurizer = ContactFeaturizer(contacts=[[ri, rj]],
                                   scheme='closest',
                                   soft_min=True,
                                   soft_min_beta=soft_min_beta)
    features = featurizer.transform([traj])[0]

    atom_distances = md.compute_distances(traj, atom_pairs)
    exp_sum = np.sum(np.exp(soft_min_beta / atom_distances), axis=1)
    expected = soft_min_beta / np.log(exp_sum)

    # Previously the np.allclose result was thrown away, so nothing was
    # actually verified.
    assert np.allclose(features, expected)
def Get_combined_features_villin():
    """Featurize the villin trajectories with dihedrals plus contacts.

    Builds a ``FeatureSelector`` over a ``DihedralFeaturizer`` and a
    ``ContactFeaturizer`` (both with default settings), changes into the
    villin trajectory folder, deletes any existing ``combined`` output
    directory, and writes the transformed dataset there in ``dir-npy``
    format.

    Returns the dataset produced by ``fit_transform_with``.
    """
    from msmbuilder.featurizer import DihedralFeaturizer
    from msmbuilder.featurizer import ContactFeaturizer
    named_featurizers = [
        ("di_villin", DihedralFeaturizer()),
        ("con_villin", ContactFeaturizer()),
    ]

    import os
    import shutil
    os.chdir('/homes/anuginueni/traj_villin')
    # Start from a clean output directory.
    if os.path.isdir('/homes/anuginueni/traj_villin/combined'):
        shutil.rmtree('/homes/anuginueni/traj_villin/combined')

    from msmbuilder.dataset import dataset
    trajectories = dataset(
        "/homes/anuginueni/traj_villin/*.xtc",
        topology='/homes/anuginueni/traj_villin/filtered.pdb',
        stride=5)

    from msmbuilder.feature_selection import FeatureSelector
    selector = FeatureSelector(named_featurizers)
    return trajectories.fit_transform_with(
        selector, '/homes/anuginueni/traj_villin/combined/', fmt='dir-npy')
def featurize_trajectories(coords, featurizer):
    '''
    Featurize trajectories with the msmbuilder featurizer selected by name.

    Input
        coords : list of 'MDTrajDataset' object
        featurizer : name of the featurizer, one of 'RMSDFeaturizer'
            (reference is coords[0]), 'DRIDFeaturizer',
            'ContactFeaturizer' (CA scheme) or 'DihedralFeaturizer'
            (phi and psi only)

    Output
        features : list of arrays, length n_trajs, each of shape
            (n_samples, n_features)

    Raises
        ValueError : if featurizer is not one of the names above
            (this includes None)
    '''
    supported = ('RMSDFeaturizer', 'DRIDFeaturizer', 'ContactFeaturizer',
                 'DihedralFeaturizer')
    if featurizer not in supported:
        # Previously an unrecognised name fell through every branch and
        # failed with UnboundLocalError on ``feat``.
        raise ValueError('Unsupported featurizer %r; expected one of: %s' %
                         (featurizer, ', '.join(supported)))

    # Import lazily so only the requested msmbuilder class is loaded.
    if featurizer == 'RMSDFeaturizer':
        from msmbuilder.featurizer import RMSDFeaturizer
        feat = RMSDFeaturizer(reference_traj=coords[0])
    elif featurizer == 'DRIDFeaturizer':
        from msmbuilder.featurizer import DRIDFeaturizer
        feat = DRIDFeaturizer()
    elif featurizer == 'ContactFeaturizer':
        from msmbuilder.featurizer import ContactFeaturizer
        feat = ContactFeaturizer(scheme='ca')
    else:  # 'DihedralFeaturizer'
        from msmbuilder.featurizer import DihedralFeaturizer
        feat = DihedralFeaturizer(types=['phi', 'psi'])
    return feat.fit_transform(coords)
def test_contacts():
    """Default ContactFeaturizer yields 171 features on the Fs peptide set."""
    fs_trajectories = MinimalFsPeptide().get_cached().trajectories
    featurized = ContactFeaturizer().transform(fs_trajectories)
    n_features = featurized[0].shape[1]
    assert n_features == 171
def calculate_contact_mat(traj, scheme):
    """Return the contact features of a single trajectory.

    ``scheme`` is forwarded to msmbuilder's ``ContactFeaturizer``; the
    result is that featurizer's ``partial_transform(traj)`` output.
    """
    from msmbuilder.featurizer import ContactFeaturizer

    featurizer = ContactFeaturizer(scheme=scheme)
    return featurizer.partial_transform(traj)
def main():
    """Command-line driver: featurize, decompose, cluster and score trajectories.

    Steps, in the order the code performs them:

    1. Parse the command line and load every ``*.pdb`` under ``pdbpath``
       with msmbuilder's ``dataset``.
    2. Featurize with ``featurize_trajectories``.
    3. Optionally reduce dimensionality with ``decompose_features``.
    4. Cluster with ``cluster_features`` and locate the cluster centers
       with ``find_cluster_center_indices``.
    5. Build a transition count matrix with ``calc_transition_count_mat``.
    6. Compute per-frame quantities (RMSD to initial/target, native
       contact counts, potential, TM-score) depending on which optional
       inputs were given.
    7. Compute FAST reward scores with ``calc_FAST_reward_score`` for the
       chosen physical trait.
    8. Write ``<output>*.dat`` text files and ``<output>*.png`` plots
       (via ``plot_cluster``).

    This function uses Python 2 ``print`` statements.  It relies on names
    that are not defined inside it (``os``, ``np``, ``pd``, ``ceil``,
    ``floor`` and the helper functions named above); they are assumed to
    exist at module level.

    NOTE(review): the block nesting below was inferred from which
    variables each statement uses -- confirm against the original layout.
    NOTE(review): several help strings start with a backslash followed by
    a space in this copy; presumably this was a line-continuation
    backslash originally -- confirm.
    """
    import argparse, textwrap
    parser = argparse.ArgumentParser(
        usage=textwrap.dedent(
            '''Use "python %(prog)s -h" for more information.'''),
        formatter_class=argparse.RawTextHelpFormatter,
        description=textwrap.dedent('''\ First, this program employs msmbuilder to featurize given pdb trajectories into vectorizable space. Second, the vector space is decompose by tICA or PCA to further reduce the dimension. Third, clustering is performed so that each structure in the trajectories is labeled by an index. Forth, Marcov State Model, albeit may not be well behaved, is built on the labeled trajectories. Last, FAST reward scores are calculated based on the transition-count matrix and user-chosen physical traits. Example: $ python FAST.py path_to_pdb_trajectories/ --featurizer=DRIDFeaturizer --decomposer=PCA --decomposer-n-components=5 --clusterer=KCenters --n-clusters=5 --msm-prior-counts=0.2 --physical-trait=target-RMSD --target-pdb=/path_to_target_pdb/target.pdb '''
                                    ))
    parser.add_argument(
        'pdbpath',
        help=textwrap.dedent('''[required] Path to pdb trajectories.'''))
    parser.add_argument(
        '--lag-time',
        default=1,
        type=int,
        help=textwrap.dedent('''Lag time of the model. Default value = 1.'''))
    parser.add_argument('--featurizer',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\ Featurizer at your choice. Available featurizers are (select them by name): (1) RMSDFeaturizer; (2) DihedralFeaturizer, only phi and psi angles; (3) DRIDFeaturizer (DRID, Distribution of Reciprocal of Interatomic Distances); (4) ContactFeaturizer, CA contact. Note: user must choose a featurization method. Choose by name. ''')
                        )
    parser.add_argument('--decomposer',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\ Decomposer at your choice. Available decomposers are: (1) tICA; (2) PCA. Note: selection of decomposer is not necessary but recommended. If not provided, program will ignore this step and cluster directly on raw features. '''
                                             ))
    parser.add_argument(
        '--decomposer-n-components',
        default=None,
        type=int,
        help=textwrap.dedent(
            '''Number of components to keep. if n_components is not set all components are kept.'''
        ))
    parser.add_argument('--clusterer',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\ Clustering method at your choice. Available clusterer are: (1) KMeans; (2) KCenters; (3) KMedoids; (4) MiniBatchKMeans; (5) MiniBatchKMedoids. Note: user must choose a clusering method. '''))
    parser.add_argument(
        '--n-clusters',
        default=5,
        type=int,
        help=textwrap.dedent(
            '''The number of clusters to form as well as the number of centroids to generate.'''
        ))
    parser.add_argument('--msm-n-timescales',
                        default=None,
                        type=int,
                        help=textwrap.dedent('''\ The number of dynamical timescales to calculate when diagonalizing the transition matrix. If not specified, it will compute n_states - 1. '''))
    parser.add_argument('--msm-prior-counts',
                        default=0,
                        type=float,
                        help=textwrap.dedent('''\ Add a number of 'pseudo counts' to each entry in the counts matrix after ergodic trimming. When prior_counts == 0 (default), the assigned transition probability between two states with no observed transitions will be zero, whereas when prior_counts > 0, even this unobserved transitions will be given nonzero probability. '''))
    parser.add_argument('--physical-trait',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\ Physical trait used in calculation of FAST reward score. Available choices are: (1) target-RMSD, if chosen, user must supply a target structure; (2) target-native-contact, if chosen, user must supply a target structure; (3) target-tmscore, if chosen, user must supply the data file containing the TM-scores in column; (4) potential, target free, if chosen, user must supply the data file containing the potentials in column; Note: user must choose a physical trait. '''))
    parser.add_argument('--target-pdb',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\ The target pdb structure. Note: The target pdb should have the same number of atoms in structure with that in pdb trajectories. '''
                                             ))
    parser.add_argument('--initial-pdb',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\ The initial pdb structure. Note: The initial pdb should have the same number of atoms in structure with that in pdb trajectories. '''
                                             ))
    parser.add_argument(
        '--potential',
        default=None,
        type=str,
        help=textwrap.dedent(
            '''The potential file corresponding to the pdb trajectories. '''))
    parser.add_argument(
        '--tmscore',
        default=None,
        type=str,
        help=textwrap.dedent(
            '''The TM-score file corresponding to the pdb trajectories. '''))
    parser.add_argument(
        '--fast-n-simulations',
        default=30,
        type=int,
        help=textwrap.dedent(
            '''Number of parallel simulations in each round of FAST algorithm. Default value: 30. '''
        ))
    # NOTE(review): this help text says "Number of clusters" but the value
    # is passed to calc_FAST_reward_score as ``alpha`` -- looks copy-pasted.
    parser.add_argument(
        '--fast-alpha',
        default=1.,
        type=float,
        help=textwrap.dedent('''Number of clusters. Default value: 1.0.'''))
    parser.add_argument('--output',
                        type=str,
                        default='output',
                        help=textwrap.dedent('''Output file name.'''))
    args = parser.parse_args()

    from msmbuilder.dataset import dataset
    coords = dataset(os.path.join(args.pdbpath, '*.pdb'))
    print '%i trajectories found. \n' % len(coords)

    ## featurize
    # NOTE(review): --featurizer defaults to None, which is passed straight
    # through to featurize_trajectories -- confirm that helper handles it.
    features = featurize_trajectories(coords, args.featurizer)
    print "%s selected" % args.featurizer
    print "features: (n_samples, n_features) = (%i, %i) for each trajectory \n" % (
        features[0].shape[0], features[0].shape[1])

    ## decompose
    if args.decomposer == None:
        print "No decomposer is selected! Program will directly cluster the raw features. \n"
    else:
        features = decompose_features(
            features,
            args.decomposer,
            n_components=args.decomposer_n_components,
            lag_time=args.lag_time)
        print "%s selected" % args.decomposer
        print "features: (n_samples, n_features) = (%i, %i) for each trajectory \n" % (
            features[0].shape[0], features[0].shape[1])

    ## clustering
    clst = cluster_features(features,
                            args.clusterer,
                            n_clusters=args.n_clusters)
    # cci is used below to index the concatenated per-frame arrays
    # (rmsd_to_target[cci], potential[cci], ...).
    cci = find_cluster_center_indices(features, clst)
    print "%s selected" % args.clusterer
    print "Cluster center indices: %s \n" % cci

    ## build msm
    #msm = build_msm(clst.labels_, lag_time=args.lag_time, n_timescales=args.msm_n_timescales, prior_counts=args.msm_prior_counts)
    #print msm, '\n'
    #print "Transition count matrix: \n %s \n" % msm.countsmat_
    #print "Relative population of each state: %s \n" % msm.populations_

    ## construct transition count matrix
    transition_count_mat = calc_transition_count_mat(
        np.concatenate(clst.labels_), args.n_clusters)
    print 'Transition count matrix: \n', transition_count_mat

    #### calculate FAST reward score
    # One row per cluster center; columns are added as quantities become
    # available.
    output_df = pd.DataFrame()
    output_df['idx'] = cci
    output_df['#cluster'] = transition_count_mat.diagonal()

    if args.initial_pdb != None:
        import mdtraj as md
        initial = md.load(args.initial_pdb)
        from msmbuilder.featurizer import RMSDFeaturizer
        # Concatenate all trajectories and keep the first feature column.
        rmsd_to_initial = np.concatenate(
            RMSDFeaturizer(initial).fit_transform(coords))[:, 0]
        output_df['iniRMSD'] = rmsd_to_initial[cci]

    if args.target_pdb != None:
        import mdtraj as md
        target = md.load(args.target_pdb)
        from msmbuilder.featurizer import RMSDFeaturizer
        rmsd_to_target = np.concatenate(
            RMSDFeaturizer(target).fit_transform(coords))[:, 0]
        native_contact_dists, native_contact_pairs = md.compute_contacts(
            target, scheme='ca')
        # Native contacts: CA-CA pairs within 0.75 in the first frame of
        # the target.
        native_contact_pairs = native_contact_pairs[np.where(
            native_contact_dists[0] <= 0.75)]
        print "Target structure has %i pairs of CA-CA contact in total. \n" % len(
            native_contact_pairs)
        from msmbuilder.featurizer import ContactFeaturizer
        native_contact_to_target = np.concatenate(
            ContactFeaturizer(
                contacts=native_contact_pairs,
                scheme='ca').fit_transform(coords))  # (n_samples, n_pairs)
        # Binarise with the same 0.75 cutoff, then count formed contacts
        # per frame.
        native_contact_to_target = np.select([
            native_contact_to_target <= 0.75, native_contact_to_target > 0.75
        ], [1, 0])
        native_contact_to_target = np.sum(native_contact_to_target, axis=1)
        output_df['tarRMSD'] = rmsd_to_target[cci]
        output_df['#NativeContact'] = native_contact_to_target[cci]

    if args.potential != None:
        potential = np.loadtxt(args.potential)
        output_df['potential'] = potential[cci]

    if args.tmscore != None:
        tmscore = np.loadtxt(args.tmscore)
        output_df['tmscore'] = tmscore[cci]

    # choose physical trait
    # NOTE(review): in each branch below the "User must provide ..."
    # message is only printed; execution then continues into
    # calc_FAST_reward_score with a variable that was never assigned,
    # which would raise NameError. There is also no branch for an unknown
    # or missing --physical-trait, leaving rewards/sims/c unbound --
    # confirm intended handling.
    print "%s is selected in FAST \n" % args.physical_trait
    if args.physical_trait == 'target-RMSD':
        if args.target_pdb == None:
            print "User must provide a target structure! \n"
        rewards, sims, c = calc_FAST_reward_score(
            rmsd_to_target,
            cci,
            transition_count_mat,
            alpha=args.fast_alpha,
            n_simulations=args.fast_n_simulations,
            minmax='min')
    elif args.physical_trait == 'target-native-contact':
        if args.target_pdb == None:
            print "User must provide a target structure! \n"
        rewards, sims, c = calc_FAST_reward_score(
            native_contact_to_target,
            cci,
            transition_count_mat,
            alpha=args.fast_alpha,
            n_simulations=args.fast_n_simulations,
            minmax='max')
    elif args.physical_trait == 'target-tmscore':
        if args.tmscore == None:
            print "User must provide a TM-score file corresponding to the pdb trajectories! \n"
        rewards, sims, c = calc_FAST_reward_score(
            tmscore,
            cci,
            transition_count_mat,
            alpha=args.fast_alpha,
            n_simulations=args.fast_n_simulations,
            minmax='max')
    elif args.physical_trait == 'potential':
        if args.potential == None:
            print "User must provide a potential file corresponding to the pdb trajectories! \n"
        rewards, sims, c = calc_FAST_reward_score(
            potential,
            cci,
            transition_count_mat,
            alpha=args.fast_alpha,
            n_simulations=args.fast_n_simulations,
            minmax='min')
    output_df['#Transition'] = c
    output_df['reward'] = rewards
    output_df['#sim'] = sims

    ## output
    # One line per cluster: center index and the ``sims`` value returned
    # by calc_FAST_reward_score.
    with open(args.output + '.CenterIdx_ClusterSize.dat', 'w') as f:
        for i in range(args.n_clusters):
            print >> f, '%6i %6i' % (cci[i], sims[i])
    if args.initial_pdb != None:
        with open(args.output + '.iniRMSD.dat', 'w') as f:
            for ele in rmsd_to_initial:
                print >> f, '%8.3f' % ele
    if args.target_pdb != None:
        with open(args.output + '.tarRMSD.dat', 'w') as f:
            for ele in rmsd_to_target:
                print >> f, '%8.3f' % ele
        with open(args.output + '.tarNativeContact.dat', 'w') as f:
            for ele in native_contact_to_target:
                print >> f, '%8.3f' % ele
    with open(args.output + '.dat', 'w') as f:
        print >> f, output_df

    ## plot
    if args.target_pdb != None:
        plot_cluster(X=rmsd_to_target,
                     Y=native_contact_to_target,
                     cluster_center_indices=cci,
                     figname=args.output + '.tarRMSD_tarNativeContact.png',
                     x_label='RMSD to target / nm',
                     y_label='# native contact',
                     xmin=0,
                     xmax=ceil(rmsd_to_target.max(), 0),
                     ymin=0,
                     ymax=ceil(native_contact_to_target.max()),
                     c_map='winter',
                     cc_color='red')
        if args.initial_pdb != None:
            # NOTE(review): X is rmsd_to_initial but xmax is taken from
            # rmsd_to_target (and ymax from rmsd_to_initial) -- the two
            # limits look swapped; confirm.
            plot_cluster(X=rmsd_to_initial,
                         Y=rmsd_to_target,
                         cluster_center_indices=cci,
                         figname=args.output + '.tarRMSD_iniRMSD.png',
                         x_label='RMSD to initial / nm',
                         y_label='RMSD to target / nm',
                         xmin=0,
                         xmax=ceil(rmsd_to_target.max(), 0),
                         ymin=0,
                         ymax=ceil(rmsd_to_initial.max(), 0),
                         c_map='winter',
                         cc_color='red')
        if args.tmscore != None:
            plot_cluster(X=tmscore,
                         Y=native_contact_to_target,
                         cluster_center_indices=cci,
                         figname=args.output + '.tmscore_tarNativeContact.png',
                         x_label='TM-score to target',
                         y_label='# native contact',
                         xmin=0,
                         xmax=1,
                         ymin=0,
                         ymax=ceil(native_contact_to_target.max()),
                         c_map='winter',
                         cc_color='red')
            if args.potential != None:
                plot_cluster(X=tmscore,
                             Y=potential,
                             cluster_center_indices=cci,
                             figname=args.output + '.tmscore_potential.png',
                             x_label='TM-score to target',
                             y_label='potential',
                             xmin=0,
                             xmax=1,
                             ymin=floor(potential.min()),
                             ymax=ceil(potential.max()),
                             c_map='winter',
                             cc_color='red')
        if args.potential != None:
            plot_cluster(X=rmsd_to_target,
                         Y=potential,
                         cluster_center_indices=cci,
                         figname=args.output + '.tarRMSD_potential.png',
                         x_label='RMSD to target / nm',
                         y_label='potential',
                         xmin=0,
                         xmax=ceil(rmsd_to_target.max(), 0),
                         ymin=floor(potential.min()),
                         ymax=ceil(potential.max()),
                         c_map='winter',
                         cc_color='red')
    # Scatter of the first two decomposed components, when a decomposer
    # was used.
    if args.decomposer == 'tICA':
        cat_features = np.concatenate(features)
        plot_cluster(X=cat_features[:, 0],
                     Y=cat_features[:, 1],
                     cluster_center_indices=cci,
                     figname=args.output + '.tICA_1st_2nd.png',
                     x_label='tIC 1',
                     y_label='tIC 2',
                     xmin=floor(cat_features[:, 0].min()),
                     xmax=ceil(cat_features[:, 0].max()),
                     ymin=floor(cat_features[:, 1].min()),
                     ymax=ceil(cat_features[:, 1].max()),
                     c_map='winter',
                     cc_color='red')
    elif args.decomposer == 'PCA':
        cat_features = np.concatenate(features)
        plot_cluster(X=cat_features[:, 0],
                     Y=cat_features[:, 1],
                     cluster_center_indices=cci,
                     figname=args.output + '.PCA_1st_2nd.png',
                     x_label='PC 1',
                     y_label='PC 2',
                     xmin=floor(cat_features[:, 0].min()),
                     xmax=ceil(cat_features[:, 0].max()),
                     ymin=floor(cat_features[:, 1].min()),
                     ymax=ceil(cat_features[:, 1].max()),
                     c_map='winter',
                     cc_color='red')
def create_equivalent_contact_featurizer(yaml_file, alignment_file,
                                         protein_list=None, pairs=None,
                                         same_residue=True, transform=None,
                                         **kwargs):
    """
    Create an equivalent-contacts featurizer for each protein in a set.

    :param yaml_file: yaml file location
    :param alignment_file: alignment file location
    :param protein_list: proteins to build featurizers for. Defaults to the
        "protein_list" entry of the loaded yaml file.
    :param pairs: wanted pairs of sequence index positions in the
        alignment. You need to just figure out the wanted location for one
        residue; the _map_residue_ind_seq_ind function can help with this.
        Defaults to every pair of positions up to the longest aligned
        sequence.
    :param same_residue: True if you want to restrict to positions that
        carry the same residue code in every aligned sequence.
    :param transform: 'logistic', 'binary', or None / 'none' for the plain
        contact featurizer.
    :param kwargs: kwargs for the contact featurizer
    :return: dictionary of contact featurizers, one for each protein
    :raises ValueError: if transform is not one of the values above
    """
    # Fail fast on a bad transform instead of discovering it only after
    # the files were parsed and the first protein was processed.
    if transform == 'logistic':
        featurizer_cls = LogisticContactFeaturizer
    elif transform == 'binary':
        featurizer_cls = BinaryContactFeaturizer
    elif transform is None or transform == "none":
        featurizer_cls = ContactFeaturizer
    else:
        raise ValueError(
            "transform needs to be one of logistic, binary, none")

    featurizer_dict = {}

    # load yaml and alignment files
    yaml_file = load_yaml_file(yaml_file)
    alignment_file = _parse_alignment_file(alignment_file)
    if protein_list is None:
        protein_list = yaml_file["protein_list"]
    if pairs is None:
        # use the max length (probably a horrible idea)
        max_seq_len = max(len(alignment_file[name])
                          for name in alignment_file.keys())
        pairs = list(itertools.combinations(range(max_seq_len), 2))

    # Set of tuples: O(1) membership, and it behaves the same whether
    # ``pairs`` was given as tuples, lists or array rows.
    wanted_pairs = {tuple(pair) for pair in pairs}

    # Which alignment positions are usable does not depend on the protein
    # being processed, so work it out once.
    keep_positions = []
    for position in np.unique(pairs):
        # the possible codes at this position across every aligned sequence
        possible_codes = {alignment_file[name][position]
                          for name in alignment_file.keys()}
        # skip positions where some sequence has a gap
        if "-" in possible_codes:
            continue
        if same_residue and len(possible_codes) != 1:
            continue
        keep_positions.append(position)

    for protein in protein_list:
        print(protein)
        # get mapping and seq
        prt_mapping, _prt_seq = _map_residue_ind_seq_ind(
            yaml_file, protein, alignment_file[protein])
        # invert the mapping to translate alignment positions into this
        # protein's residue indices
        inv_map = {v: k for k, v in prt_mapping.items()}
        # sorted so the pair order is deterministic
        can_keep = np.sort([inv_map[position] for position in keep_positions])
        # NOTE(review): these candidate pairs are residue indices, while
        # ``pairs`` holds alignment positions; the membership test is kept
        # from the original code -- confirm the two index spaces are meant
        # to be compared directly.
        actual_pairs = np.array(
            [pair for pair in itertools.combinations(can_keep, 2)
             if pair in wanted_pairs])
        featurizer_dict[protein] = featurizer_cls(contacts=actual_pairs,
                                                  **kwargs)
    return featurizer_dict