Example #1
def test_robustscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.RobustScaler
    # with sklearn.preprocessing.RobustScaler

    robustscalerr = RobustScalerR()
    robustscalerr.fit(np.concatenate(trajs))

    robustscaler = RobustScaler()
    robustscaler.fit(trajs)

    y_ref1 = robustscalerr.transform(trajs[0])
    y1 = robustscaler.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
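The test above relies on module-level context that the snippet omits. A minimal sketch of the assumed setup, with a hypothetical random `trajs` fixture standing in for real feature trajectories:

import numpy as np
from msmbuilder.preprocessing import RobustScaler
from sklearn.preprocessing import RobustScaler as RobustScalerR

# Hypothetical fixture: a handful of (n_frames, n_features) "trajectories"
rs = np.random.RandomState(42)
trajs = [rs.randn(100, 3) for _ in range(5)]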
Example #3
    def scale_data(self, scaler='Robust'):

        print('Scale featurized data has been called\n')
        print('-------------------------------\n')
        from msmbuilder.preprocessing import RobustScaler

        if scaler == 'Robust':
            scaler = RobustScaler()
        # any other value is assumed to already be a scaler object

        # self.sim_seqs is a list of per-trajectory feature arrays
        self.scaled_data = scaler.fit_transform(self.sim_seqs)

        print('scaled ', self.scaled_data[0].shape)
        print('Scaling featurized data finished successfully')
        print('-----------------------------------\n')
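A hypothetical call site, assuming an analysis object whose `sim_seqs` attribute holds the list of featurized trajectories:

analysis.scale_data(scaler='Robust')  # `analysis` is a hypothetical instance
print(analysis.scaled_data[0].shape)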
Example #4
    def setUp(self):
        numpy.random.seed(12)
        self.top = 'data_app/runs/structure.prmtop'
        self.traj_1 = 'data_app/runs/run-000.nc'
        self.traj_2 = 'data_app/runs/run-001.nc'
        self.feat = DihedralFeaturizer()
        self.traj_dict = {
            0: load(self.traj_1, top=self.top),
            1: load(self.traj_2, top=self.top)
        }
        self.scaler = RobustScaler()
        self.tica = tICA(n_components=2)
        self.ftrajs = {
            0: numpy.random.rand(100, 50),
            1: numpy.random.rand(100, 50),
        }
Example #5
    def build_model(self, user_defined_model):
        """
        Load or build a model (Pipeline from scikit-learn) to do all the transforming and fitting
        :param user_defined_model: Either a string (to load from disk) or a Pipeline object to use as model
        :return model: Return the model back
        """
        if user_defined_model is None:
            if os.path.exists(self.model_pkl_fname):
                logger.info('Loading model pkl file {}'.format(
                    self.model_pkl_fname))
                model = load_generic(self.model_pkl_fname)
            else:
                logger.info('Building default model based on dihedrals')

                # Use a lag time of 1 ns for the tICA and MSM steps; if the
                # stride is too coarse for that, fall back to a lag of one
                # frame and report how long that is in ns.
                if self.app.meta is not None:
                    lag_time = max(1, int(1 / self.timestep))
                    logger.info(
                        'Using a lag time of {} ns for the tICA and MSM'.
                        format(lag_time * self.timestep))
                else:
                    self.timestep = None
                    lag_time = 1
                    logger.warning(
                        'Cannot determine timestep. Defaulting to a lag of '
                        '{} frame.'.format(lag_time))
                model = Pipeline([('feat', DihedralFeaturizer()),
                                  ('scaler', RobustScaler()),
                                  ('tICA',
                                   tICA(lag_time=lag_time,
                                        commute_mapping=True,
                                        n_components=10)),
                                  ('clusterer',
                                   MiniBatchKMeans(n_clusters=200)),
                                  ('msm',
                                   MarkovStateModel(lag_time=lag_time,
                                                    ergodic_cutoff='off',
                                                    reversible_type=None))])
        else:
            if not isinstance(user_defined_model, Pipeline):
                raise ValueError(
                    'model is not an sklearn.pipeline.Pipeline object')
            else:
                logger.info('Using user defined model')
                model = user_defined_model
        return model
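A hypothetical call site for this method, where `app` stands in for the object it belongs to:

model = app.build_model(None)          # load from disk, or build the default pipeline
model = app.build_model(my_pipeline)   # `my_pipeline` must be an sklearn Pipeline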
Example #6
#
# TIMESCALES
#
# The data will be loaded with a stride of 10 frames. Each frame is 50 ps, so the
# effective time per loaded frame is 500 ps, i.e. 0.5 ns/frame.
# Each trajectory is 1000 frames long.
# The MSM lag time is 40 ns (80 frames), based on a visual inspection of /Misc/MSM_lag_time.ipynb
to_ns = 0.5
msm_lag = int(40 / to_ns)

#
# FEATURE INDICES
#
all_idx = np.load('indices_all.npy')

#
# OTHER PARAMETERS
#
ref_traj = md.load('../Data/data/trajectory-1.xtc',
                   top='../Data/data/fs-peptide.pdb')

# `feats` is the list of (name, featurizer) tuples defined in Example #8 below
featurizer = FeatureSelector(features=feats)

pipe = Pipeline([('features', featurizer),
                 ('variance_cut', VarianceThreshold()),
                 ('scaling', RobustScaler()), ('cluster', MiniBatchKMeans()),
                 ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False))])

save_generic(pipe, 'model.pickl')
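The pickled pipeline can later be restored with the matching loader from msmbuilder.io, e.g.:

from msmbuilder.io import load_generic

pipe = load_generic('model.pickl')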
Example #7
        # The data may be too large to fit in memory at once. The dataset
        # object lazily-loads trajectories as they are needed.
        # Below, we create a dataset out of the many *.xtc files we downloaded. We only load
        # every 10th frame
        xyz = dataset("./*.xtc", topology='./%s' % args.pdb)

        # The raw (x, y, z) coordinates from the simulation do not respect the translational
        # and rotational symmetry of our problem. A Featurizer transforms cartesian
        # coordinates into other representations. Here we use the DihedralFeaturizer to turn
        # our data into phi and psi dihedral angles.
        featurizer = DihedralFeaturizer(types=['phi', 'psi'])
        diheds = xyz.fit_transform_with(featurizer, 'diheds/', fmt='dir-npy')

        # Since the range of values in our raw data can vary widely from feature to feature,
        # we can scale values to reduce bias. Here we use the RobustScaler to center and
        # scale our dihedral angles by their respective interquartile ranges.
        scaler = RobustScaler()
        scaled_diheds = diheds.fit_transform_with(scaler,
                                                  'scaled_diheds/',
                                                  fmt='dir-npy')

        # Intermediate kinetic model: tICA
        # tICA is similar to principal component analysis
        tica_model = tICA(lag_time=int(args.lag),
                          n_components=int(args.components))
        # fit and transform can be done in separate steps:
        tica_model = scaled_diheds.fit_with(tica_model)
        tica_trajs = scaled_diheds.transform_with(tica_model,
                                                  'ticas/',
                                                  fmt='dir-npy')

        # Conformations need to be clustered into states (sometimes written as microstates).
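The snippet stops just before the clustering step it announces. A minimal continuation, assuming the same lazy-dataset API used above (the 'kmeans/' directory and cluster count are illustrative):

        from msmbuilder.cluster import MiniBatchKMeans
        clusterer = MiniBatchKMeans(n_clusters=100)
        clustered_trajs = tica_trajs.fit_transform_with(clusterer, 'kmeans/',
                                                        fmt='dir-npy')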
Example #8
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel
from msmbuilder.io import save_generic
from sklearn.base import clone, BaseEstimator
from six import iteritems

# The data will be loaded with a stride of 10 frames. Each frame is 50 ps, so the
# effective time per loaded frame is 500 ps, i.e. 0.5 ns/frame.
# Each trajectory is 1000 frames long.
# The MSM lag time is 40 ns (80 frames), based on a visual inspection of /Misc/MSM_lag_time.ipynb
to_ns = 0.5
msm_lag = int(40 / to_ns)

# ['phi', 'psi', 'omega', 'chi1', 'chi2', 'chi3', 'chi4']

feats = [('backbone_dihed', DihedralFeaturizer(types=['phi', 'psi'])),
         ('residues_dihed',
          DihedralFeaturizer(types=['chi1', 'chi2', 'chi3', 'chi4'])),
         ('contacts', ContactFeaturizer())]

featurizer = FeatureSelector(features=feats)

pipe = Pipeline([('features', featurizer),
                 ('variance_cut', VarianceThreshold()),
                 ('scaling', RobustScaler()),
                 ('tica', tICA(kinetic_mapping=True)),
                 ('cluster', MiniBatchKMeans()),
                 ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False))])

save_generic(pipe, 'model.pickl')
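Because each stage is named, hyperparameters can be adjusted after construction with sklearn's double-underscore convention; the values below are illustrative:

pipe.set_params(tica__lag_time=msm_lag, cluster__n_clusters=200)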
Example #9
def calculate_fitness(population_dihedral, diheds, score_global, i, lock):
    import pandas as pd
    import numpy as np
    pop_index = i
    new_diheds = []

    for i in range(0, len(diheds)):
        X = diheds[i]
        selected_features = X[:, population_dihedral]
        new_diheds.append(selected_features)
    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(new_diheds)
    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=5)
    tica_model.fit(scaled_diheds)
    tica_trajs = tica_model.transform(scaled_diheds)
    from msmbuilder.cluster import MiniBatchKMeans
    clusterer = MiniBatchKMeans(n_clusters=200, random_state=42)

    clustered_trajs = clusterer.fit_transform(tica_trajs)
    from msmbuilder.msm import MarkovStateModel
    msm = MarkovStateModel(lag_time=50, n_timescales=5)
    #msm.fit_transform(clustered_trajs)
    from sklearn.model_selection import KFold
    n_states = [4]
    cv = KFold(n_splits=5)
    results = []
    for n in n_states:
        msm.n_states_ = n
        for fold, (train_index,
                   test_index) in enumerate(cv.split(clustered_trajs)):
            train_data = [clustered_trajs[i] for i in train_index]
            test_data = [clustered_trajs[i] for i in test_index]
            msm.fit(train_data)
            train_score = msm.score(train_data)
            test_score = msm.score(test_data)
            time_score = msm.timescales_[0]
            time_test_score = time_score + test_score
            print(time_score)
            print(test_score)
            av_score = time_test_score / 2
            results.append({
                'train_score': train_score,
                'test_score': test_score,
                'time_score': time_score,
                'av_score': av_score,
                'n_states': n,
                'fold': fold
            })
            print(msm.timescales_)
    results = pd.DataFrame(results)
    avgs = (results.groupby('n_states').aggregate(np.median).drop('fold',
                                                                  axis=1))
    best_nt = avgs['test_score'].idxmax()
    best_n = avgs['av_score'].idxmax()
    best_score = avgs.loc[best_n, 'av_score']
    best_scorent = avgs.loc[best_nt, 'test_score']
    print(best_scorent)
    lock.acquire()
    score_global.update({pop_index: best_scorent})
    lock.release()
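calculate_fitness is written to run as a multiprocessing worker: it stores the score of one population member in a shared dict under a lock. A hypothetical single-process driver, assuming `diheds` is a list of (n_frames, n_features) arrays:

import multiprocessing

manager = multiprocessing.Manager()
scores = manager.dict()
lock = multiprocessing.Lock()
# score the feature subset [0, 1, 2] as population member 0
calculate_fitness([0, 1, 2], diheds, scores, 0, lock)
print(scores[0])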
def main():
    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m',
                     '--min_samples',
                     help='min_samples',
                     default=5,
                     type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprobe', default=10, type=int)

    # Download example dataset
    from msmbuilder.example_datasets import AlanineDipeptide
    ala2 = AlanineDipeptide(verbose=False)
    xyz = ala2.get().trajectories
    print(ala2.description())

    #xyz = [t[::10] for t in xyz]
    print("{} trajectories".format(len(xyz)))
    # msmbuilder does not keep track of units! You must keep track of your
    # data's timestep
    to_ns = 0.5
    print("with length {} ns".format(set(len(x) * to_ns for x in xyz)))

    from msmbuilder.featurizer import DihedralFeaturizer
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    diheds = featurizer.fit_transform(xyz)

    print(xyz[0].xyz.shape)
    print(diheds[0].shape)

    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(diheds)

    print(diheds[0].shape)
    print(scaled_diheds[0].shape)

    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=2)
    # fit and transform can be done in separate steps:
    tica_model.fit(diheds)

    tica_trajs = tica_model.transform(diheds)
    featurizer = DihedralFeaturizer(types=['phi', 'psi'], sincos=False)
    diheds = featurizer.fit_transform(xyz)
    print(diheds[0].shape)
    print(tica_trajs[0].shape)

    # ===========================================================================
    #if os.path.isfile("./phi_angles.txt") and os.path.isfile("./psi_angles.txt") is True:
    #    phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
    #    psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    #X = np.column_stack((phi_angles, psi_angles))
    #print(X.shape)
    phi_angles = np.degrees(diheds[0][:, 0])
    psi_angles = np.degrees(diheds[0][:, 1])
    print(phi_angles)
    X = tica_trajs[0].astype(np.float32)
    #print(X)
    n_size = X.shape[0]
    dimension = X.shape[1]

    #print(X.shape)

    # ===========================================================================
    args = cli.parse_args()
    eps = args.eps  # eps
    min_samples = args.min_samples  # min_samples
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True
    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d' %
          (n_size, dimension, eps, min_samples))

    n_samples = 1000
    percent = 0.9
    import random
    whole_samples = random.sample(list(X), n_samples)
    #print whole_samples
    from sklearn.metrics.pairwise import pairwise_distances
    sample_dist_metric = pairwise_distances(whole_samples,
                                            whole_samples,
                                            metric='l2')
    print(sample_dist_metric.shape)
    sample_dist = []
    for i in range(0, n_samples):
        for j in range(i + 1, n_samples):
            sample_dist.append(sample_dist_metric[i, j])
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.30, 0.20, 0.10]:  #,0.005, 0.003,
        #                   0.002, 0.001, 0.0008, 0.0005, 0.0003, 0.0002, 0.0001, 0.00005]:
        #percent /= 10.0
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        dc = sorted_sample_dist[index]
        #print index, sorted_sample_dist[index]
        eps_list.append(dc)
    print(eps_list)

    #print X
    # ===========================================================================
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    #potential = True
    remove_outliers = False
    potential = False
    eps = eps_list[0]
    min_samples = 1
    len_frames = len(X)
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = Faiss_DBSCAN(eps=eps,
                      min_samples=min_samples,
                      nlist=nlist,
                      nprobe=nprobe,
                      metric="l2",
                      GPU=False,
                      IVFFlat=IVFFlat)
    db.fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    old_assignments = db.labels_
    n_microstates = len(
        set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculating percentage of each states
    frame_bincount = np.bincount(
        old_assignments[old_assignments >= 0])  #remove outliers
    frame_freq_index_sorted = np.argsort(
        frame_bincount)[::-1]  # descending arg sort
    frame_freq_percent_sorted = frame_bincount[
        frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])
    old_frame_freq_percent_sorted = frame_freq_percent_sorted
    old_frame_freq_index_sorted = frame_freq_index_sorted
    n_microstates = len(
        set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)
    iter_name = clustering_name + '0' + '_eps_' + str(
        eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
            n_microstates)
    plot_cluster(labels=old_assignments,
                 phi_angles=phi_angles,
                 psi_angles=psi_angles,
                 name=iter_name,
                 potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [50, 30, 10]
    #min_samples_list = [50, 30, 20, 15, 10, 8, 5, 2]
    n_min_samples = len(min_samples_list)
    #eps_list = [3.0, 2.0, 1.0, 0.8, 0.5]
    #min_samples_list = [3, 3, 3, 3, 3, 2, 2]

    results = np.zeros((n_min_samples, n_iterations, len_frames),
                       dtype=np.int32)
    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = Faiss_DBSCAN(eps=eps,
                          min_samples=min_samples,
                          nlist=nlist,
                          nprobe=nprobe,
                          metric="l2",
                          GPU=False,
                          IVFFlat=IVFFlat).fit(X)

        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        new_assignments = db.labels_
        if i == n_iterations - 1:
            remove_outliers = True
        #else:
        #    remove_outliers = False
        assignments = merge_assignments(new_assignments,
                                        old_assignments,
                                        remove_outliers=remove_outliers)
        n_microstates = len(set(assignments)) - (1 if -1 in assignments else 0)

        #results[j,i, :]= np.array(assignments)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        #print('Estimated number of clusters: %d' % n_microstates)
        iter_name = clustering_name + str(i) + '_eps_' + str(
            eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
                n_microstates)
        plot_cluster(labels=assignments,
                     phi_angles=phi_angles,
                     psi_angles=psi_angles,
                     name=iter_name,
                     potential=potential)
        #old_assignments = assignments
    #print(results)
    #np.save("results.npy", results)
    #np.savetxt("results.csv", results, fmt="%d", delimiter=",")
    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt",
               min_samples_list,
               fmt="%d",
               delimiter=",")
Example #11

if __name__ == "__main__":

    trajectory_dir = '/Volumes/REA_Data/AADH/traj_5_rxts'
    topology_file = '/Users/robert_arbon/Code/AADH/Analysis/MSM_Reactants_Only/2agy_rxt.psf'
    reference_file = '/Users/robert_arbon/Code/AADH/Analysis/MSM_Reactants_Only/2agy_rxt.pdb'
    reference_traj = md.load(reference_file)

    # Load the meta data
    meta = load_metadata(traj_dir=trajectory_dir, top=topology_file)

    # Featurize
    feature = RawPositionsFeaturizer(ref_traj=reference_traj)
    ftrajs = featurize(featurizer=feature, meta_data=meta)

    # Summarize
    variance = np.var(combine(ftrajs), axis=0)
    plot_features(variance,
                  name='Variance.png',
                  feature_name='Variance',
                  ordered=False)

    # Normalize
    scaler = RobustScaler()
    strajs = scaler.fit_transform(ftrajs)

    # perform tICA
    tica_obj = tICA(n_components=10, lag_time=10, kinetic_mapping=True)
    tica_traj = tica_obj.fit_transform(strajs)
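load_metadata, featurize, combine, and plot_features are project-local helpers not shown in this example. A rough sketch of what `combine` presumably does, i.e. stack the per-trajectory feature arrays so summary statistics can be taken over all frames:

import numpy as np

def combine(ftrajs):
    # ftrajs: list of (n_frames, n_features) arrays, one per trajectory
    return np.concatenate(ftrajs, axis=0)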
Example #12
class TestUtils:
    def setUp(self):
        numpy.random.seed(12)
        self.top = 'data_app/runs/structure.prmtop'
        self.traj_1 = 'data_app/runs/run-000.nc'
        self.traj_2 = 'data_app/runs/run-001.nc'
        self.feat = DihedralFeaturizer()
        self.traj_dict = {
            0: load(self.traj_1, top=self.top),
            1: load(self.traj_2, top=self.top)
        }
        self.scaler = RobustScaler()
        self.tica = tICA(n_components=2)
        self.ftrajs = {
            0: numpy.random.rand(100, 50),
            1: numpy.random.rand(100, 50),
        }

    def test_get_ftrajs(self):
        output = get_ftrajs(self.traj_dict, self.feat)
        assert len(output) == 2
        assert type(output) == dict

    def test_get_sctrajs(self):
        self.scaler.fit(list(self.ftrajs.values()))
        output = get_sctrajs(self.ftrajs, self.scaler)
        assert len(output) == 2
        assert type(output) == dict

    def test_get_ttrajs(self):
        self.tica.fit(list(self.ftrajs.values()))
        output = get_ttrajs(self.ftrajs, self.tica)
        assert len(output) == 2
        assert type(output) == dict

    def test_traj_from_stateinds(self):
        traj = traj_from_stateinds(spawns, meta)
        assert traj.n_frames == 1

    def test_write_production_file(self):
        write_production_file()
        assert os.path.exists('Production.in')
        os.remove('Production.in')

    def test_write_cpptraj_script(self):
        write_cpptraj_script(self.traj_1, self.top)
        assert os.path.exists('script.cpptraj')
        os.remove('script.cpptraj')

    def test_write_tleap_script(self):
        write_tleap_script(write=True)
        assert os.path.exists('script.tleap')
        os.remove('script.tleap')

    def test_create_folder(self):
        fname = 'foo'
        create_folder(fname)
        assert os.path.isdir(fname)
        os.removedirs(fname)

    def test_create_symlinks(self):
        create_folder('src_symlinks')
        create_folder('dst_symlinks')
        with open('src_symlinks/1.txt', 'w') as f:
            f.writelines('foo')

        create_symlinks(files='src_symlinks/*.txt', dst_folder='dst_symlinks')
        assert os.path.exists('dst_symlinks/1.txt')
        rmtree('src_symlinks')
        rmtree('dst_symlinks')

    def test_hmr_prmtop(self):
        new_top = hmr_prmtop(self.top, save=False)
        assert isinstance(new_top, AmberParm)
Example #13
dump(f, "raw_featurizer.pkl")

#featurizer = DihedralFeaturizer(types=['chi1', 'chi2'], resids= 73,74,75,76,77,78,79,80,81,82,83)
diheds = featurizer.fit_transform(ds)
dump(diheds, "features.pkl")

#print(ds[0].shape)
print(diheds[0].shape)

# this basically maps every feature to atom indices.
df1 = pd.DataFrame(featurizer.describe_features(ds))
dump(df1, "feature_descriptor.pkl")

#Robust scaling
from msmbuilder.preprocessing import RobustScaler
scaler = RobustScaler()
scaled_diheds = scaler.fit_transform(diheds)

print(diheds[0].shape)
print(scaled_diheds[0].shape)

#Reducing dimension
tica_model = tICA(lag_time=1, n_components=10)
# fit and transform can be done in separate steps:
tica_model.fit(diheds)
tica_trajs = tica_model.transform(diheds)

print(diheds[0].shape)
print(tica_trajs[0].shape)

# let's dump the tICA model for future use
def main_modified(generations):
    import numpy as np
    from msmbuilder.preprocessing import RobustScaler
    import time
    import pickle
    import os
    import multiprocessing
    os.environ["OMP_NUM_THREADS"] = "1"
    import operator
    from multiprocessing import Pool
    from operator import itemgetter

    diheds = Get_dihedral_features_villin()
    scaler = RobustScaler()
    scaled_feature = scaler.fit_transform(diheds)
    # Laplacian score of each dihedral: returns the column means and the
    # important features
    Val = Laplacian_score(scaled_feature)
    col_mean = Val[0]
    imp_features = Val[1]
    current_gen = 0
    for_each_gen_score = []
    population_each_gen = []
    population_dihedral = initial_population(imp_features)
    cross_probability = 0.8
    num_parents = int(cross_probability * len(population_dihedral))
    population_dihedral_duplicate = []
    numberOfThreads = multiprocessing.cpu_count()
    f = open("benzamidine_diheds_ga_score" + str(generations) + ".txt", "a")

    while current_gen < generations:
        manager = multiprocessing.Manager()
        score = manager.dict()
        processes = []
        lock = multiprocessing.Lock()
        for i in range(len(population_dihedral)):
            p = multiprocessing.Process(
                target=calculate_fitness,
                args=(population_dihedral[i], scaled_feature, score, i, lock))
            processes.append(p)
        # starttime = time.time()
        # chunks() (defined elsewhere) splits the process list into batches
        # of at most numberOfThreads processes
        for batch in chunks(processes, numberOfThreads):
            p_count = 0
            for process in batch:
                process.start()
                p_count = p_count + 1
            print("started processes: " + str(p_count))
            for process in batch:
                process.join()
                p_count = p_count - 1
            print("joined processes: " + str(p_count))
            for process in batch:
                process.terminate()
                p_count = p_count + 1
            print("terminated processes: " + str(p_count))

        scored_population = dict(
            sorted(score.items(), key=operator.itemgetter(1)))
        for_each_gen_score.append(scored_population)
        population_each_gen.append(population_dihedral)
        scored_population_list = list(scored_population.keys())
        parents = select_parents_rank_based(scored_population,
                                            population_dihedral,
                                            cross_probability)
        offsprings_1 = crossover(parents, population_dihedral)

        parents_binary = parents_binarize(parents, imp_features)
        count_mutation = len(population_dihedral) - len(offsprings_1)
        offsprings_2_binary = mutation_binary_offspring(
            parents_binary, 4, count_mutation)
        offsprings_2 = binary_to_pop_dih(offsprings_2_binary)
        for i in range(len(offsprings_2)):
            offsprings_2[i] = np.asarray(offsprings_2[i])
        for i in range(len(offsprings_1)):
            offsprings_1[i] = np.asarray(offsprings_1[i])
        offsprings = offsprings_1 + offsprings_2
        # offsprings.append(population_dihedral[scored_population_list[len(scored_population_list)-1]])
        # offsprings.append(population_dihedral[scored_population_list[len(scored_population_list)-2]])
        population_dihedral = offsprings
        current_gen = current_gen + 1

    print(for_each_gen_score, file=f)
    f.close()
    return for_each_gen_score, population_each_gen, scaled_feature, imp_features
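A hypothetical invocation of the GA driver, assuming the helper functions (Get_dihedral_features_villin, Laplacian_score, chunks, and the crossover/mutation helpers) are defined in the same module:

scores, populations, scaled_feature, imp_features = main_modified(10)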
Example #15
from msmbuilder.preprocessing import RobustScaler
import numpy as np
from msmbuilder.io import load_trajs, save_trajs, save_generic
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from utilities import plot_box

if __name__ == '__main__':

    # Load
    feature_name = 'Positions'
    meta, feature_trajs = load_trajs('Unscaled-{}-ftraj'.format(feature_name))

    # Select scaler
    featurizer = RobustScaler()

    # Fit the scaler on all trajectories, then transform each one separately
    featurizer.fit(list(feature_trajs.values()))
    scaled_trajs = {}
    for k, v in feature_trajs.items():
        scaled_trajs[k] = featurizer.partial_transform(v)

    # Plot scaled features
    ftrajs = np.concatenate([fx[::100] for fx in scaled_trajs.values()])
    fig, ax = plt.subplots(figsize=(15, 5))
    plot_box(ax, fxx=ftrajs, feature_name='Scaled {}'.format(feature_name))
    fig.tight_layout()
    fig.savefig("Scaled-{}-box.pdf".format(feature_name))

    # Save
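The final Save step is cut off in the source. A plausible completion using the save_trajs helper already imported at the top, mirroring the 'Unscaled-...' input name:

    save_trajs(scaled_trajs, 'Scaled-{}-ftraj'.format(feature_name), meta)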