예제 #1
0
파일: base.py 프로젝트: msultan/mdentropy
    def _extract_data(self, traj):
        """Collect per-residue contact distances for ``traj`` into one table.

        A ``ContactFeaturizer`` built from this object's ``contacts``,
        ``scheme`` and ``ignore_nonprotein`` attributes computes the distances
        and reports which residue pair each feature column belongs to.  For
        every residue id found in those pairs, the columns involving that
        residue are gathered into a ``DataFrame`` whose columns are labelled
        by (partner residue, residue); the per-residue frames are finally
        concatenated side by side.
        """
        featurizer = ContactFeaturizer(
            contacts=self.contacts,
            scheme=self.scheme,
            ignore_nonprotein=self.ignore_nonprotein,
        )
        distances = featurizer.partial_transform(traj)
        residue_pairs = [
            feature["resids"] for feature in featurizer.describe_features(traj)
        ]

        frames = []
        for resid in np.unique(residue_pairs):
            involved = []  # one flag per feature column: does it touch resid?
            partners = []  # the other residue of every selected pair
            for pair in residue_pairs:
                hit = resid in pair
                involved.append(hit)
                if hit:
                    partners.append(list(set(pair) - {resid})[0])
            frames.append(
                pd.DataFrame(
                    distances[:, np.array(involved)],
                    columns=[partners, [resid] * len(partners)],
                )
            )

        return pd.concat(frames, axis=1)
def test_ContactFeaturizer_describe_features():
    """Check that described contact features line up with the feature columns.

    A randomly chosen scheme and trajectory are featurized; then 25 randomly
    drawn feature columns are recomputed with ``md.compute_contacts`` from the
    residue pair reported by ``describe_features`` and compared for exact
    equality.
    """
    scheme = np.random.choice(['ca', 'closest', 'closest-heavy'])
    featurizer = ContactFeaturizer(scheme=scheme, ignore_nonprotein=True)
    traj = trajectories[np.random.randint(len(trajectories))]
    computed = featurizer.transform([traj])[0]
    described = pd.DataFrame(featurizer.describe_features(traj))

    for _ in range(25):
        column = np.random.choice(len(described))
        resids = described.iloc[column].resids
        expected, _pairs = md.compute_contacts(traj,
                                               contacts=[resids],
                                               scheme=scheme)
        assert (computed[:, column] == expected.flatten()).all()
예제 #3
0
def Get_contacts_features_villin():
    """Featurize the villin trajectories with CA contacts.

    Changes the working directory to the trajectory folder, removes any
    existing ``contacts`` directory there, prints the residue pairs behind
    each contact feature (as described for ``trajectory-331.xtc``) and returns
    whatever ``fit_transform_with`` yields for the full ``*.xtc`` dataset.
    """
    import os
    import shutil
    import mdtraj as md

    topology = '/homes/anuginueni/traj_villin/filtered.pdb'

    os.chdir('/homes/anuginueni/traj_villin')
    # Drop the output of a previous run; the path is relative to the chdir above.
    if os.path.isdir('./contacts'):
        shutil.rmtree('./contacts')

    from msmbuilder.dataset import dataset
    xyz = dataset("/homes/anuginueni/traj_villin/*.xtc",
                  topology=topology,
                  stride=5)
    reference = md.load("/homes/anuginueni/traj_villin/trajectory-331.xtc",
                        top=topology,
                        stride=5)

    from msmbuilder.featurizer import ContactFeaturizer  # for contacts
    featurizer = ContactFeaturizer(scheme='ca')
    residue_pairs = [
        feature['resids'] for feature in featurizer.describe_features(reference)
    ]
    print(str(residue_pairs))

    return xyz.fit_transform_with(featurizer, 'contacts/', fmt='dir-npy')
예제 #4
0
    def _extract_data(self, traj):
        """Build a table of contact distances grouped by residue.

        The distances and their residue-pair descriptions come from a
        ``ContactFeaturizer`` configured with this object's ``contacts``,
        ``scheme`` and ``ignore_nonprotein`` attributes.  Each residue id that
        appears in a pair contributes one ``DataFrame`` holding the distance
        columns it takes part in, labelled by (partner residue, residue); all
        of these frames are concatenated column-wise and returned.
        """
        featurizer = ContactFeaturizer(
            contacts=self.contacts,
            scheme=self.scheme,
            ignore_nonprotein=self.ignore_nonprotein,
        )
        distances = featurizer.partial_transform(traj)
        described = featurizer.describe_features(traj)
        residue_pairs = [feature['resids'] for feature in described]

        per_residue = []
        for resid in np.unique(residue_pairs):
            column_mask = []  # True where the feature column involves resid
            partner_ids = []  # the residue on the other side of each such pair
            for pair in residue_pairs:
                contains = resid in pair
                column_mask.append(contains)
                if contains:
                    partner_ids.append(list(set(pair) - {resid})[0])
            selected = distances[:, np.array(column_mask)]
            labels = [partner_ids, [resid] * len(partner_ids)]
            per_residue.append(pd.DataFrame(selected, columns=labels))

        return pd.concat(per_residue, axis=1)
def test_distance_to_logistic():
    """Check the logistic contact transform against the raw contact distances.

    For ten randomly drawn (frame, feature) entries of the first trajectory,
    a distance above ``center`` must map to a logistic value below 0.5 and a
    distance below ``center`` to a value above 0.5.
    """
    trajectories = MinimalFsPeptide().get_cached().trajectories
    steepness = np.absolute(10 * np.random.randn())
    center = np.absolute(np.random.randn())
    contacts = ContactFeaturizer().transform(trajectories)
    logistics = LogisticContactFeaturizer(
        center=center, steepness=steepness).transform(trajectories)

    n_frames, n_features = contacts[0].shape
    for _ in range(10):
        # randint's upper bound is exclusive, so pass the full extent to keep
        # the last frame and the last feature reachable.
        i = np.random.randint(n_frames)
        j = np.random.randint(n_features)

        x = contacts[0][i][j]
        y = logistics[0][i][j]

        if x > center:
            assert y < 0.5
        if x < center:
            assert y > 0.5
def test_soft_min_contact_featurizer():
    """Check the soft-min contact distance against a direct computation.

    Two distinct residues are drawn at random, the ``ContactFeaturizer`` is
    asked for their 'closest'-scheme soft-min distance, and the result is
    compared with ``beta / log(sum(exp(beta / d)))`` evaluated over all atom
    pairs between the two residues.
    """
    # just get one frame for now
    traj = MinimalFsPeptide().get_cached().trajectories[0][0]
    soft_min_beta = 20

    ri, rj = np.random.choice(
        np.arange(traj.top.n_residues), size=2, replace=False)
    atoms_i = [atom.index for atom in traj.top.residue(ri).atoms]
    atoms_j = [atom.index for atom in traj.top.residue(rj).atoms]
    atom_pairs = list(itertools.product(atoms_i, atoms_j))

    featurizer = ContactFeaturizer(contacts=[[ri, rj]], scheme='closest',
                                   soft_min=True,
                                   soft_min_beta=soft_min_beta)
    features = featurizer.transform([traj])[0]

    distances = md.compute_distances(traj, atom_pairs)
    expected = soft_min_beta / \
        np.log(np.sum(np.exp(soft_min_beta / distances), axis=1))

    # The comparison has to be asserted; a bare np.allclose() call checks nothing.
    assert np.allclose(features, expected)
예제 #7
0
def test_distance_to_logistic():
    """Verify that the logistic featurizer flips around ``center``.

    Ten random (frame, feature) entries of the first trajectory are compared:
    raw contact distances larger than ``center`` must give logistic values
    below 0.5, and distances smaller than ``center`` values above 0.5.
    """
    trajectories = MinimalFsPeptide().get_cached().trajectories
    steepness = np.absolute(10 * np.random.randn())
    center = np.absolute(np.random.randn())
    contactfeaturizer = ContactFeaturizer()
    contacts = contactfeaturizer.transform(trajectories)
    logisticcontactfeaturizer = LogisticContactFeaturizer(center=center,
                                                          steepness=steepness)
    logistics = logisticcontactfeaturizer.transform(trajectories)

    n_frames, n_features = contacts[0].shape
    for _ in range(10):
        # The upper bound of randint is exclusive; using the full extent keeps
        # the final frame and final feature column in the sampled range.
        i = np.random.randint(n_frames)
        j = np.random.randint(n_features)

        x = contacts[0][i][j]
        y = logistics[0][i][j]

        if x > center:
            assert y < 0.5
        if x < center:
            assert y > 0.5
def main():
    """Count native CA-CA contacts in every trajectory frame and save them.

    Command-line arguments are ``pdbpath`` (the pdb trajectories, handed to
    msmbuilder's ``dataset``) and ``target`` (the target pdb).  Residue pairs
    whose CA-CA distance in the target is at most 0.75 are taken as native
    contacts; for every frame the number of those pairs that are again within
    0.75 is written to ``<target>.<pdbpath>.number_native_contact.dat`` as
    ``total formed fraction``.

    Output uses single-argument ``print(...)`` and ``file.write`` so the
    function runs on both Python 2 and Python 3.
    """
    import argparse
    import textwrap
    parser = argparse.ArgumentParser(
        usage=textwrap.dedent(
            '''Use "python %(prog)s -h" for more information.'''),
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        'pdbpath',
        help=textwrap.dedent('''[required] Path to pdb trajectories.'''))
    parser.add_argument('target',
                        help=textwrap.dedent('''[required] Path to target pdb.
        Note: The target pdb should have the same number of atoms in structure with that in pdb trajectories. '''
                                             ))
    args = parser.parse_args()

    from msmbuilder.dataset import dataset
    coords = dataset(args.pdbpath)
    print('%i trajectories found. ' % len(coords))

    import mdtraj as md
    target = md.load(args.target)

    contact_cutoff = 0.75  # same distance threshold for target and trajectories

    # Native contacts: CA-CA pairs of the target structure within the cutoff.
    native_contact_dists, native_contact_pairs = md.compute_contacts(
        target, scheme='ca')
    native_contact_pairs = native_contact_pairs[
        native_contact_dists[0] <= contact_cutoff]
    n_native_contact = len(native_contact_pairs)
    print("Target structure has %i pairs of CA-CA contact in total. \n" %
          n_native_contact)

    from msmbuilder.featurizer import ContactFeaturizer
    native_contact_to_target = np.concatenate(
        ContactFeaturizer(
            contacts=native_contact_pairs,
            scheme='ca').fit_transform(coords))  # (n_samples, n_pairs)
    # Per frame: how many native pairs are within the cutoff.
    native_contact_to_target = np.sum(
        native_contact_to_target <= contact_cutoff, axis=1)

    out_name = '%s.%s.number_native_contact.dat' % (
        get_basename_no_ext(args.target), get_basename_no_ext(args.pdbpath))
    with open(out_name, 'w') as f:
        for e in native_contact_to_target:
            f.write('%i %i %.3f\n' % (n_native_contact, e,
                                      e * 1. / n_native_contact))
def test_FeatureSelector_describe_features():
    """Check ``FeatureSelector.describe_features`` against its parts.

    The description produced by a selector combining a CA contact featurizer
    and a dihedral featurizer must have as many rows as both individual
    descriptions together, and 40 randomly picked rows must agree, column by
    column, with the individual descriptions concatenated in the selector's
    ``feat_list`` order.
    """
    traj = trajectories[np.random.randint(len(trajectories))]

    contact_feat = ContactFeaturizer(scheme='CA', ignore_nonprotein=True)
    contact_feat.transform([traj])
    contact_df = pd.DataFrame(contact_feat.describe_features(traj))

    dihedral_feat = DihedralFeaturizer()
    dihedral_feat.transform([traj])
    dihedral_df = pd.DataFrame(dihedral_feat.describe_features(traj))

    described = {"ca": contact_df, "dih": dihedral_df}

    selector = FeatureSelector([('ca', contact_feat), ('dih', dihedral_feat)])
    selector.transform([traj])
    combined_df = pd.DataFrame(selector.describe_features(traj))
    assert len(combined_df) == len(contact_df) + len(dihedral_df)

    expected_df = pd.concat([described[name] for name in selector.feat_list])
    # lets randomly compare 40 features
    for row in np.random.choice(range(len(combined_df)), 40):
        for column in combined_df.columns:
            assert eq(combined_df.iloc[row][column],
                      expected_df.iloc[row][column])
예제 #10
0
def test_FeatureSelector_describe_features():
    """Compare a combined feature description with its two components.

    A CA contact featurizer and a dihedral featurizer are described on one
    randomly chosen trajectory, both on their own and through a
    ``FeatureSelector``.  The combined table must be as long as the two
    component tables together, and 40 random rows must match the component
    tables stacked in ``feat_list`` order.
    """
    rnd_traj = np.random.randint(len(trajectories))
    chosen = trajectories[rnd_traj]

    f_ca = ContactFeaturizer(scheme='CA', ignore_nonprotein=True)
    f_ca.transform([chosen])
    ca_table = pd.DataFrame(f_ca.describe_features(chosen))

    f_dih = DihedralFeaturizer()
    f_dih.transform([chosen])
    dih_table = pd.DataFrame(f_dih.describe_features(chosen))

    tables = {"ca": ca_table, "dih": dih_table}

    f_comb = FeatureSelector([('ca', f_ca), ('dih', f_dih)])
    f_comb.transform([chosen])
    comb_table = pd.DataFrame(f_comb.describe_features(chosen))
    assert len(comb_table) == len(ca_table) + len(dih_table)

    stacked = pd.concat([tables[key] for key in f_comb.feat_list])
    # lets randomly compare 40 features
    for row in np.random.choice(range(len(comb_table)), 40):
        for col in comb_table.columns:
            assert eq(comb_table.iloc[row][col], stacked.iloc[row][col])
예제 #11
0
def featurize_trajectories(coords, featurizer):
    """Featurize trajectories with the msmbuilder featurizer named ``featurizer``.

    Parameters
    ----------
    coords
        Trajectories handed to the featurizer's ``fit_transform``.  For
        ``'RMSDFeaturizer'`` the first entry, ``coords[0]``, is also used as
        the reference trajectory.
    featurizer : str
        One of ``'RMSDFeaturizer'``, ``'DRIDFeaturizer'``,
        ``'ContactFeaturizer'`` (CA scheme) or ``'DihedralFeaturizer'``
        (phi and psi angles).

    Returns
    -------
    The result of ``fit_transform(coords)`` on the selected featurizer.

    Raises
    ------
    ValueError
        If ``featurizer`` is not one of the supported names (including
        ``None``).
    """
    # Imports stay inside the branches so only the chosen featurizer is loaded.
    if featurizer == 'RMSDFeaturizer':
        from msmbuilder.featurizer import RMSDFeaturizer
        feat = RMSDFeaturizer(reference_traj=coords[0])
    elif featurizer == 'DRIDFeaturizer':
        from msmbuilder.featurizer import DRIDFeaturizer
        feat = DRIDFeaturizer()
    elif featurizer == 'ContactFeaturizer':
        from msmbuilder.featurizer import ContactFeaturizer
        feat = ContactFeaturizer(scheme='ca')
    elif featurizer == 'DihedralFeaturizer':
        from msmbuilder.featurizer import DihedralFeaturizer
        feat = DihedralFeaturizer(types=['phi', 'psi'])
    else:
        # Without this branch an unknown name would surface as an
        # UnboundLocalError on ``feat`` below.
        raise ValueError(
            "Unknown featurizer %r; choose one of: RMSDFeaturizer, "
            "DRIDFeaturizer, ContactFeaturizer, DihedralFeaturizer" %
            (featurizer,))
    return feat.fit_transform(coords)
예제 #12
0
def test_soft_min_contact_featurizer():
    """Compare the featurizer's soft-min distance with a hand computation.

    For one frame and one random pair of distinct residues, the
    'closest'-scheme soft-min feature must equal
    ``beta / log(sum(exp(beta / d)))`` taken over the distances ``d`` of all
    atom pairs between the two residues.
    """
    # just get one frame for now
    traj = MinimalFsPeptide().get_cached().trajectories[0][0]
    soft_min_beta = 20

    ri, rj = np.random.choice(np.arange(traj.top.n_residues),
                              size=2,
                              replace=False)
    aind_i = [atom.index for atom in traj.top.residue(ri).atoms]
    aind_j = [atom.index for atom in traj.top.residue(rj).atoms]

    atom_pairs = list(itertools.product(aind_i, aind_j))

    featurizer = ContactFeaturizer(contacts=[[ri, rj]],
                                   scheme='closest',
                                   soft_min=True,
                                   soft_min_beta=soft_min_beta)

    features = featurizer.transform([traj])[0]
    distances = md.compute_distances(traj, atom_pairs)
    distances = soft_min_beta / \
        np.log(np.sum(np.exp(soft_min_beta / distances), axis=1))

    # Assert the comparison; calling np.allclose() alone verifies nothing.
    assert np.allclose(features, distances)
예제 #13
0
def Get_combined_features_villin():
    """Featurize the villin trajectories with dihedrals and contacts combined.

    Builds a ``FeatureSelector`` over a default ``DihedralFeaturizer`` and a
    default ``ContactFeaturizer``, changes into the trajectory folder, removes
    any existing ``combined`` output directory and returns the result of
    ``fit_transform_with`` on the ``*.xtc`` dataset.
    """
    from msmbuilder.featurizer import DihedralFeaturizer
    from msmbuilder.featurizer import ContactFeaturizer
    features = [
        ("di_villin", DihedralFeaturizer()),
        ("con_villin", ContactFeaturizer()),
    ]

    import os
    import shutil
    os.chdir('/homes/anuginueni/traj_villin')
    # Remove the output of a previous run before writing a new one.
    previous_output = '/homes/anuginueni/traj_villin/combined'
    if os.path.isdir(previous_output):
        shutil.rmtree(previous_output)

    from msmbuilder.dataset import dataset
    xyz = dataset("/homes/anuginueni/traj_villin/*.xtc",
                  topology='/homes/anuginueni/traj_villin/filtered.pdb',
                  stride=5)

    from msmbuilder.feature_selection import FeatureSelector
    selector = FeatureSelector(features)
    return xyz.fit_transform_with(selector,
                                  '/homes/anuginueni/traj_villin/combined/',
                                  fmt='dir-npy')
def featurize_trajectories(coords, featurizer):
    '''
    Featurize trajectories with the msmbuilder featurizer selected by name.

    Input
    coords : list of 'MDTrajDataset' object
    featurizer : name of the featurizer, one of 'RMSDFeaturizer' (reference is
        coords[0]), 'DRIDFeaturizer', 'ContactFeaturizer' (CA scheme) or
        'DihedralFeaturizer' (phi and psi angles)

    Output
    features : list of arrays, length n_trajs, each of shape (n_samples, n_features)

    Raises
    ValueError : if featurizer is not one of the supported names (e.g. None)
    '''
    # Imports are kept per branch so only the requested featurizer is loaded.
    if featurizer == 'RMSDFeaturizer':
        from msmbuilder.featurizer import RMSDFeaturizer
        feat = RMSDFeaturizer(reference_traj=coords[0])
    elif featurizer == 'DRIDFeaturizer':
        from msmbuilder.featurizer import DRIDFeaturizer
        feat = DRIDFeaturizer()
    elif featurizer == 'ContactFeaturizer':
        from msmbuilder.featurizer import ContactFeaturizer
        feat = ContactFeaturizer(scheme='ca')
    elif featurizer == 'DihedralFeaturizer':
        from msmbuilder.featurizer import DihedralFeaturizer
        feat = DihedralFeaturizer(types=['phi', 'psi'])
    else:
        # An unrecognised name would otherwise reach the return statement with
        # ``feat`` unassigned and fail with an UnboundLocalError.
        raise ValueError(
            "Unknown featurizer %r; choose one of: RMSDFeaturizer, "
            "DRIDFeaturizer, ContactFeaturizer, DihedralFeaturizer" %
            (featurizer,))
    return feat.fit_transform(coords)
def test_contacts():
    """A default ``ContactFeaturizer`` gives 171 features on the Fs peptide set."""
    trajectories = MinimalFsPeptide().get_cached().trajectories
    features = ContactFeaturizer().transform(trajectories)

    n_features = features[0].shape[1]
    assert n_features == 171
def calculate_contact_mat(traj, scheme):
    """Return the contact features of ``traj`` computed with ``scheme``.

    Thin wrapper around ``ContactFeaturizer(scheme=scheme).partial_transform``.
    """
    from msmbuilder.featurizer import ContactFeaturizer
    featurizer = ContactFeaturizer(scheme=scheme)
    contact_features = featurizer.partial_transform(traj)
    return contact_features
예제 #17
0
def _build_fast_parser():
    """Return the ``argparse`` parser describing the FAST command-line options."""
    import argparse
    import textwrap

    parser = argparse.ArgumentParser(
        usage=textwrap.dedent(
            '''Use "python %(prog)s -h" for more information.'''),
        formatter_class=argparse.RawTextHelpFormatter,
        # Every line of a help block shares one space-only indent: mixing tabs
        # in leaves textwrap.dedent() without a common margin to strip.
        description=textwrap.dedent('''\
            First, this program employs msmbuilder to featurize given pdb trajectories into vectorizable space.
            Second, the vector space is decompose by tICA or PCA to further reduce the dimension. 
            Third, clustering is performed so that each structure in the trajectories is labeled by an index. 
            Forth, Marcov State Model, albeit may not be well behaved, is built on the labeled trajectories.
            Last, FAST reward scores are calculated based on the transition-count matrix and user-chosen physical traits. 

            Example:
            $ python FAST.py path_to_pdb_trajectories/ --featurizer=DRIDFeaturizer --decomposer=PCA --decomposer-n-components=5 --clusterer=KCenters --n-clusters=5 --msm-prior-counts=0.2 --physical-trait=target-RMSD --target-pdb=/path_to_target_pdb/target.pdb '''
                                    ))
    parser.add_argument(
        'pdbpath',
        help=textwrap.dedent('''[required] Path to pdb trajectories.'''))
    parser.add_argument(
        '--lag-time',
        default=1,
        type=int,
        help=textwrap.dedent('''Lag time of the model. Default value = 1.'''))
    parser.add_argument('--featurizer',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            Featurizer at your choice. Available featurizers are (select them by name): 
            (1) RMSDFeaturizer;
            (2) DihedralFeaturizer, only phi and psi angles;
            (3) DRIDFeaturizer (DRID, Distribution of Reciprocal of Interatomic Distances);
            (4) ContactFeaturizer, CA contact. 
            Note: user must choose a featurization method. Choose by name. ''')
                        )
    parser.add_argument('--decomposer',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            Decomposer at your choice. Available decomposers are: 
            (1) tICA;
            (2) PCA. 
            Note: selection of decomposer is not necessary but recommended.
            If not provided, program will ignore this step and cluster directly on raw features. '''
                                             ))
    parser.add_argument(
        '--decomposer-n-components',
        default=None,
        type=int,
        help=textwrap.dedent(
            '''Number of components to keep. if n_components is not set all components are kept.'''
        ))
    parser.add_argument('--clusterer',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            Clustering method at your choice. Available clusterer are: 
            (1) KMeans;
            (2) KCenters;
            (3) KMedoids;
            (4) MiniBatchKMeans;
            (5) MiniBatchKMedoids.
            Note: user must choose a clusering method. '''))
    parser.add_argument(
        '--n-clusters',
        default=5,
        type=int,
        help=textwrap.dedent(
            '''The number of clusters to form as well as the number of centroids to generate.'''
        ))
    parser.add_argument('--msm-n-timescales',
                        default=None,
                        type=int,
                        help=textwrap.dedent('''\
            The number of dynamical timescales to calculate when diagonalizing the transition matrix. 
            If not specified, it will compute n_states - 1. '''))
    parser.add_argument('--msm-prior-counts',
                        default=0,
                        type=float,
                        help=textwrap.dedent('''\
            Add a number of 'pseudo counts' to each entry in the counts matrix after ergodic trimming. 
            When prior_counts == 0 (default), the assigned transition probability between two states 
            with no observed transitions will be zero, whereas when prior_counts > 0, even this unobserved 
            transitions will be given nonzero probability. '''))
    parser.add_argument('--physical-trait',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            Physical trait used in calculation of FAST reward score. Available choices are: 
            (1) target-RMSD, if chosen, user must supply a target structure; 
            (2) target-native-contact, if chosen, user must supply a target structure; 
            (3) target-tmscore, if chosen, user must supply the data file containing the TM-scores in column;
            (4) potential, target free, if chosen, user must supply the data file containing the potentials in column; 
            Note: user must choose a physical trait. '''))
    parser.add_argument('--target-pdb',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            The target pdb structure. 
            Note: The target pdb should have the same number of atoms in structure with that in pdb trajectories. '''
                                             ))
    parser.add_argument('--initial-pdb',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            The initial pdb structure. 
            Note: The initial pdb should have the same number of atoms in structure with that in pdb trajectories. '''
                                             ))
    parser.add_argument(
        '--potential',
        default=None,
        type=str,
        help=textwrap.dedent(
            '''The potential file corresponding to the pdb trajectories. '''))
    parser.add_argument(
        '--tmscore',
        default=None,
        type=str,
        help=textwrap.dedent(
            '''The TM-score file corresponding to the pdb trajectories. '''))
    parser.add_argument(
        '--fast-n-simulations',
        default=30,
        type=int,
        help=textwrap.dedent(
            '''Number of parallel simulations in each round of FAST algorithm. Default value: 30. '''
        ))
    parser.add_argument(
        '--fast-alpha',
        default=1.,
        type=float,
        help=textwrap.dedent(
            '''Parameter alpha passed to the FAST reward score. Default value: 1.0.'''
        ))
    parser.add_argument('--output',
                        type=str,
                        default='output',
                        help=textwrap.dedent('''Output file name.'''))
    return parser


def _check_fast_trait_inputs(parser, args):
    """Exit with a usage error unless --physical-trait is known and has its input."""
    # trait name -> (value of the option it depends on, message when missing)
    requirements = {
        'target-RMSD':
        (args.target_pdb, 'User must provide a target structure!'),
        'target-native-contact':
        (args.target_pdb, 'User must provide a target structure!'),
        'target-tmscore':
        (args.tmscore,
         'User must provide a TM-score file corresponding to the pdb trajectories!'
         ),
        'potential':
        (args.potential,
         'User must provide a potential file corresponding to the pdb trajectories!'
         ),
    }
    if args.physical_trait not in requirements:
        parser.error('Unknown --physical-trait %r; choose one of: %s' %
                     (args.physical_trait, ', '.join(sorted(requirements))))
    supplied, message = requirements[args.physical_trait]
    if supplied is None:
        parser.error(message)


def main():
    """Run the FAST reward-score pipeline on a directory of pdb trajectories.

    Steps, in order: featurize the trajectories, optionally decompose the
    features (tICA or PCA), cluster them, build the transition count matrix
    from the cluster labels, compute whichever physical traits were supplied
    (RMSD to an initial/target structure, native CA-CA contacts of the target,
    potentials, TM-scores), compute the FAST reward with the trait chosen by
    ``--physical-trait``, then write the ``<output>.*.dat`` tables and the
    ``<output>.*.png`` cluster plots.

    The chosen physical trait and the input it needs are validated right after
    argument parsing, so a missing file or unknown trait ends the run with a
    usage error instead of a ``NameError`` after the expensive steps.

    Output uses single-argument ``print(...)`` and ``file.write`` so the
    function runs on both Python 2 and Python 3.
    """
    parser = _build_fast_parser()
    args = parser.parse_args()
    _check_fast_trait_inputs(parser, args)

    from msmbuilder.dataset import dataset
    coords = dataset(os.path.join(args.pdbpath, '*.pdb'))
    print('%i trajectories found. \n' % len(coords))

    ## featurize
    features = featurize_trajectories(coords, args.featurizer)
    print("%s selected" % args.featurizer)
    print(
        "features: (n_samples, n_features) = (%i, %i) for each trajectory \n"
        % (features[0].shape[0], features[0].shape[1]))

    ## decompose
    if args.decomposer is None:
        print(
            "No decomposer is selected! Program will directly cluster the raw features. \n"
        )
    else:
        features = decompose_features(
            features,
            args.decomposer,
            n_components=args.decomposer_n_components,
            lag_time=args.lag_time)
        print("%s selected" % args.decomposer)
        print(
            "features: (n_samples, n_features) = (%i, %i) for each trajectory \n"
            % (features[0].shape[0], features[0].shape[1]))

    ## clustering
    clst = cluster_features(features,
                            args.clusterer,
                            n_clusters=args.n_clusters)
    cci = find_cluster_center_indices(features, clst)
    print("%s selected" % args.clusterer)
    print("Cluster center indices: %s \n" % (cci,))

    ## construct transition count matrix
    # Counts come straight from the concatenated cluster labels; no MSM is fit.
    transition_count_mat = calc_transition_count_mat(
        np.concatenate(clst.labels_), args.n_clusters)
    print('Transition count matrix: \n%s' % (transition_count_mat,))

    #### calculate FAST reward score
    output_df = pd.DataFrame()
    output_df['idx'] = cci
    output_df['#cluster'] = transition_count_mat.diagonal()

    # Per-frame traits; each stays None unless its input was supplied.
    rmsd_to_initial = None
    rmsd_to_target = None
    native_contact_to_target = None
    potential = None
    tmscore = None

    if args.initial_pdb is not None:
        import mdtraj as md
        from msmbuilder.featurizer import RMSDFeaturizer
        initial = md.load(args.initial_pdb)
        rmsd_to_initial = np.concatenate(
            RMSDFeaturizer(initial).fit_transform(coords))[:, 0]

        output_df['iniRMSD'] = rmsd_to_initial[cci]

    if args.target_pdb is not None:
        import mdtraj as md
        from msmbuilder.featurizer import RMSDFeaturizer
        target = md.load(args.target_pdb)
        rmsd_to_target = np.concatenate(
            RMSDFeaturizer(target).fit_transform(coords))[:, 0]

        # Native contacts: CA-CA pairs of the target within 0.75.
        native_contact_dists, native_contact_pairs = md.compute_contacts(
            target, scheme='ca')
        native_contact_pairs = native_contact_pairs[np.where(
            native_contact_dists[0] <= 0.75)]
        print("Target structure has %i pairs of CA-CA contact in total. \n" %
              len(native_contact_pairs))

        from msmbuilder.featurizer import ContactFeaturizer
        native_contact_to_target = np.concatenate(
            ContactFeaturizer(
                contacts=native_contact_pairs,
                scheme='ca').fit_transform(coords))  # (n_samples, n_pairs)
        # Per frame: number of native pairs that are within 0.75 again.
        native_contact_to_target = np.select([
            native_contact_to_target <= 0.75, native_contact_to_target > 0.75
        ], [1, 0])
        native_contact_to_target = np.sum(native_contact_to_target, axis=1)

        output_df['tarRMSD'] = rmsd_to_target[cci]
        output_df['#NativeContact'] = native_contact_to_target[cci]

    if args.potential is not None:
        potential = np.loadtxt(args.potential)
        output_df['potential'] = potential[cci]

    if args.tmscore is not None:
        tmscore = np.loadtxt(args.tmscore)
        output_df['tmscore'] = tmscore[cci]

    # choose physical trait (its presence was checked right after parsing)
    print("%s is selected in FAST \n" % args.physical_trait)
    if args.physical_trait == 'target-RMSD':
        trait_values, minmax = rmsd_to_target, 'min'
    elif args.physical_trait == 'target-native-contact':
        trait_values, minmax = native_contact_to_target, 'max'
    elif args.physical_trait == 'target-tmscore':
        trait_values, minmax = tmscore, 'max'
    else:  # 'potential'
        trait_values, minmax = potential, 'min'

    rewards, sims, c = calc_FAST_reward_score(
        trait_values,
        cci,
        transition_count_mat,
        alpha=args.fast_alpha,
        n_simulations=args.fast_n_simulations,
        minmax=minmax)

    output_df['#Transition'] = c
    output_df['reward'] = rewards
    output_df['#sim'] = sims

    ## output
    # NOTE(review): despite the file name, the second column is the ``sims``
    # value returned by calc_FAST_reward_score - confirm this is intended.
    with open(args.output + '.CenterIdx_ClusterSize.dat', 'w') as f:
        for i in range(args.n_clusters):
            f.write('%6i %6i\n' % (cci[i], sims[i]))

    if args.initial_pdb is not None:
        with open(args.output + '.iniRMSD.dat', 'w') as f:
            for ele in rmsd_to_initial:
                f.write('%8.3f\n' % ele)

    if args.target_pdb is not None:
        with open(args.output + '.tarRMSD.dat', 'w') as f:
            for ele in rmsd_to_target:
                f.write('%8.3f\n' % ele)

        with open(args.output + '.tarNativeContact.dat', 'w') as f:
            for ele in native_contact_to_target:
                f.write('%8.3f\n' % ele)

    with open(args.output + '.dat', 'w') as f:
        f.write('%s\n' % (output_df,))

    ## plot
    def plot(X, Y, suffix, x_label, y_label, xlim, ylim):
        # All figures share the cluster centers, colour map and name prefix.
        plot_cluster(X=X,
                     Y=Y,
                     cluster_center_indices=cci,
                     figname=args.output + suffix,
                     x_label=x_label,
                     y_label=y_label,
                     xmin=xlim[0],
                     xmax=xlim[1],
                     ymin=ylim[0],
                     ymax=ylim[1],
                     c_map='winter',
                     cc_color='red')

    tmscore_lim = (0, 1)
    if args.potential is not None:
        potential_lim = (floor(potential.min()), ceil(potential.max()))

    if args.target_pdb is not None:
        target_rmsd_lim = (0, ceil(rmsd_to_target.max(), 0))
        contact_lim = (0, ceil(native_contact_to_target.max()))

        plot(rmsd_to_target, native_contact_to_target,
             '.tarRMSD_tarNativeContact.png', 'RMSD to target / nm',
             '# native contact', target_rmsd_lim, contact_lim)
        if args.initial_pdb is not None:
            # Each axis is limited by the data plotted on it: x is the RMSD
            # to the initial structure, y the RMSD to the target.
            plot(rmsd_to_initial, rmsd_to_target, '.tarRMSD_iniRMSD.png',
                 'RMSD to initial / nm', 'RMSD to target / nm',
                 (0, ceil(rmsd_to_initial.max(), 0)), target_rmsd_lim)
        if args.tmscore is not None:
            plot(tmscore, native_contact_to_target,
                 '.tmscore_tarNativeContact.png', 'TM-score to target',
                 '# native contact', tmscore_lim, contact_lim)
            if args.potential is not None:
                plot(tmscore, potential, '.tmscore_potential.png',
                     'TM-score to target', 'potential', tmscore_lim,
                     potential_lim)
        if args.potential is not None:
            plot(rmsd_to_target, potential, '.tarRMSD_potential.png',
                 'RMSD to target / nm', 'potential', target_rmsd_lim,
                 potential_lim)

    # First two components of the decomposed features, if a decomposer ran.
    component_names = {'tICA': 'tIC', 'PCA': 'PC'}
    if args.decomposer in component_names:
        cat_features = np.concatenate(features)
        first, second = cat_features[:, 0], cat_features[:, 1]
        axis_name = component_names[args.decomposer]
        plot(first, second, '.%s_1st_2nd.png' % args.decomposer,
             '%s 1' % axis_name, '%s 2' % axis_name,
             (floor(first.min()), ceil(first.max())),
             (floor(second.min()), ceil(second.max())))
예제 #18
0
def create_equivalent_contact_featurizer(yaml_file, alignment_file,
                                         protein_list=None,
                                         pairs=None,
                                         same_residue=True,
                                         transform=None,
                                         **kwargs):
    """
    Create a equivalent contacts featurizer for a set of proteins
    :param yaml_file: yaml file location
    :param alignment_file: alignment file location
    :param protein_list: proteins to build a featurizer for; defaults to the
    "protein_list" entry of the yaml file
    :param pairs: wanted sequence index positions in the alignment
    You need to just figure out the wanted location for one residue.
    _map_residue_ind_seq_ind function can help with this. Defaults to every
    pair of positions up to the longest aligned sequence.
    :param same_residue: True if you would restrict to having the same residue
    at the same sequence position.
    :param transform: which featurizer to build: 'logistic' for
    LogisticContactFeaturizer, 'binary' for BinaryContactFeaturizer, None or
    'none' for the plain ContactFeaturizer
    :param kwargs: kwargs for the contact featurizer
    :return: dictionary of contact featurizers. one for each protein
    :raises ValueError: if transform is not one of the values listed above
    """
    # Pick the featurizer class first so a bad ``transform`` fails before any
    # file is loaded.
    if transform == 'logistic':
        featurizer_cls = LogisticContactFeaturizer
    elif transform == 'binary':
        featurizer_cls = BinaryContactFeaturizer
    elif transform is None or transform == "none":
        featurizer_cls = ContactFeaturizer
    else:
        raise ValueError(
            "transform needs to be one of logistic, binary, none")

    # load alignment file
    yaml_file = load_yaml_file(yaml_file)
    alignment_file = _parse_alignment_file(alignment_file)
    if protein_list is None:
        protein_list = yaml_file["protein_list"]

    if pairs is None:
        # use the max length(probably a horrible idea)
        max_seq_len = max(
            len(alignment_file[name]) for name in alignment_file.keys())
        pairs = list(itertools.combinations(range(max_seq_len), 2))

    # Which alignment positions are usable does not depend on the protein, so
    # work it out once: no sequence may have a gap there and, with
    # same_residue, every sequence must carry the same code.
    usable_positions = []
    for position in np.unique(pairs):
        possible_codes = {
            alignment_file[name][position] for name in alignment_file.keys()
        }
        if "-" in possible_codes:
            continue
        if same_residue and len(possible_codes) != 1:
            continue
        usable_positions.append(position)

    featurizer_dict = {}
    for protein in protein_list:
        print(protein)
        # get mapping and seq
        prt_mapping, prt_seq = _map_residue_ind_seq_ind(yaml_file, protein,
                                                        alignment_file[protein])
        # invert the mapping to look residues up by alignment position
        inv_map = {v: k for k, v in prt_mapping.items()}

        # sorted so the resulting pairs come out in a deterministic order
        can_keep = np.sort([inv_map[position] for position in usable_positions])
        # NOTE(review): can_keep holds values of inv_map (residue indices)
        # while ``pairs`` is documented as sequence positions; this membership
        # test compares the two - confirm that is intended.
        actual_pairs = np.array([
            pair for pair in itertools.combinations(can_keep, 2)
            if pair in pairs
        ])
        featurizer_dict[protein] = featurizer_cls(contacts=actual_pairs,
                                                  **kwargs)
    return featurizer_dict
예제 #19
0
def test_contacts():
    """Featurizing the Fs peptide set with default contacts gives 171 columns."""
    trajectories = MinimalFsPeptide().get_cached().trajectories
    featurizer = ContactFeaturizer()
    per_trajectory = featurizer.transform(trajectories)

    first = per_trajectory[0]
    assert first.shape[1] == 171