Пример #1
0
 def make_cv_dataset(self):
     """Build the sub-training/validation pairs for repeated k-fold CV.

     A (possibly reduced) copy of ``self.training_df`` is obtained from
     ``load_format_data.sub_sample`` and split with ``RepeatedKFold``.
     The result is stored in ``self.data_pairs`` as a list of
     ``(sub_training_df, validation_df)`` tuples, one per fold, i.e.
     ``num_cv_splits * num_cv_repeats`` pairs in total.

     Reads ``self.training_df``, ``self.sample_fraction``,
     ``self.sample_seed``, ``self.num_cv_splits`` and
     ``self.num_cv_repeats``. Returns nothing.

     NOTE(review): earlier notes on this method say it is meant for the
     model classes defined in submodels_module.py, whose defaults supply
     the training dataframe and the split/repeat counts -- confirm
     against that module.
     """
     # Presumably returns a random subset of the training rows, sized by
     # sample_fraction and seeded by sample_seed -- see load_format_data.
     local_df = load_format_data.sub_sample(self.training_df,
                                            self.sample_fraction,
                                            self.sample_seed)
     # NOTE(review): no random_state is passed, so the fold assignment is
     # not tied to sample_seed and can differ between calls.
     kf = RepeatedKFold(n_splits=self.num_cv_splits,
                        n_repeats=self.num_cv_repeats)
     # split() only needs the number of samples, so a zero array of the
     # right length stands in for the data; the positional indices it
     # yields are then used to slice the dataframe.
     # The pairs are materialised in a list (rather than left as a lazy
     # zip iterator) so self.data_pairs can be traversed more than once.
     self.data_pairs = [
         (local_df.iloc[train_index], local_df.iloc[test_index])
         for train_index, test_index in kf.split(np.zeros(len(local_df)))
     ]
Пример #2
0
 def make_test_dataset(self):
     """Build the full-training/test pairs for repeated model evaluation.

     A (possibly reduced) copy of ``self.training_df`` is obtained from
     ``load_format_data.sub_sample`` and paired with ``self.testing_df``
     ``self.num_test_repeats`` times. The result is stored in
     ``self.data_pairs`` as a list of ``(train_df, test_df)`` tuples;
     every pair references the same two dataframe objects.

     Reads ``self.training_df``, ``self.sample_fraction``,
     ``self.testing_df`` and ``self.num_test_repeats``. Returns nothing.
     """
     local_df = load_format_data.sub_sample(self.training_df, self.sample_fraction)
     # The pairs are materialised in a list (rather than left as a lazy
     # zip iterator) so self.data_pairs can be traversed more than once.
     self.data_pairs = [
         (local_df, self.testing_df)
         for _ in range(self.num_test_repeats)
     ]
Пример #3
0
 def make_cv_dataset(self):
     """Build the sub-training/validation pairs for repeated k-fold CV.

     A (possibly reduced) copy of ``self.training_df`` is obtained from
     ``load_format_data.sub_sample`` and split with ``RepeatedKFold``.
     The result is stored in ``self.data_pairs`` as a list of
     ``(sub_training_df, validation_df)`` tuples, one per fold, i.e.
     ``num_cv_splits * num_cv_repeats`` pairs in total.

     Reads ``self.training_df``, ``self.sample_fraction``,
     ``self.num_cv_splits`` and ``self.num_cv_repeats``. Returns nothing.
     """
     local_df = load_format_data.sub_sample(self.training_df, self.sample_fraction)
     # NOTE(review): no random_state is passed, so the fold assignment
     # can differ between calls.
     kf = RepeatedKFold(n_splits=self.num_cv_splits, n_repeats=self.num_cv_repeats)
     # split() only needs the number of samples, so a zero array of the
     # right length stands in for the data; the positional indices it
     # yields are then used to slice the dataframe.
     # The pairs are materialised in a list (rather than left as a lazy
     # zip iterator) so self.data_pairs can be traversed more than once.
     self.data_pairs = [
         (local_df.iloc[train_index], local_df.iloc[test_index])
         for train_index, test_index in kf.split(np.zeros(len(local_df)))
     ]
Пример #4
0
 def make_test_dataset(self):
     """Build the full-training/test pairs for repeated model evaluation.

     A (possibly reduced) copy of ``self.training_df`` is obtained from
     ``load_format_data.sub_sample`` and paired with ``self.testing_df``
     ``self.num_test_repeats`` times. The result is stored in
     ``self.data_pairs`` as a list of ``(train_df, test_df)`` tuples;
     every pair references the same two dataframe objects.

     Reads ``self.training_df``, ``self.sample_fraction``,
     ``self.sample_seed``, ``self.testing_df`` and
     ``self.num_test_repeats``. Returns nothing.

     NOTE(review): earlier notes on this method say it is meant for the
     model classes defined in submodels_module.py, whose defaults supply
     the training/testing dataframes and the repeat count -- confirm
     against that module.
     """
     # Presumably returns a random subset of the training rows, sized by
     # sample_fraction and seeded by sample_seed -- see load_format_data.
     local_df = load_format_data.sub_sample(self.training_df,
                                            self.sample_fraction,
                                            self.sample_seed)
     # The pairs are materialised in a list (rather than left as a lazy
     # zip iterator) so self.data_pairs can be traversed more than once.
     self.data_pairs = [
         (local_df, self.testing_df)
         for _ in range(self.num_test_repeats)
     ]