def make_cv_dataset(self):
    """Create the list of (sub-training, validation) pairs by repeated k-fold CV.

    Reads ``self.training_df``, ``self.sample_fraction``, ``self.sample_seed``,
    ``self.num_cv_splits`` and ``self.num_cv_repeats``; stores the result in
    ``self.data_pairs``.

    ``self.data_pairs`` is a list of 2-tuples ``(train_df, validate_df)``.
    The two members of each tuple are complementary slices of the same
    sub-sampled training data. It is stored as a real list (not a bare
    ``zip`` iterator) so it can be iterated more than once.

    Notes carried over from the original author's comments (not verifiable
    from this block alone -- confirm against the class definitions):
      * Intended for the objects defined in submodels_module.py (referred to
        as the LIST_A objects).
      * The default ``self.training_df`` is loaded from either the
        seq_to_assay_train_1,8,10.pkl file or the
        assay_to_dot_training_data.pkl file, depending on the object.
      * ``self.num_cv_splits`` and ``self.num_cv_repeats`` are each either
        10 or 3, depending on the object.
    """
    # Presumably returns a random subset holding `sample_fraction` of the
    # training rows, seeded by `sample_seed` -- helper lives in
    # load_format_data.py; verify there.
    local_df = load_format_data.sub_sample(
        self.training_df, self.sample_fraction, self.sample_seed
    )

    # RepeatedKFold produces n_splits * n_repeats (train, test) index pairs.
    # NOTE(review): no random_state is passed, so the folds are not tied to
    # self.sample_seed and may differ between calls.
    kf = RepeatedKFold(
        n_splits=self.num_cv_splits, n_repeats=self.num_cv_repeats
    )

    # split() only needs the number of rows, so a zero-filled placeholder of
    # the same length stands in for the actual data.
    placeholder = np.zeros(len(local_df))

    # Positional indexing (.iloc) turns each index pair into the matching
    # complementary training / validation dataframes.
    self.data_pairs = [
        (local_df.iloc[train_index], local_df.iloc[test_index])
        for train_index, test_index in kf.split(placeholder)
    ]
def make_test_dataset(self):
    """Create the list of (full training set, test set) pairs.

    Used for repeated model performance evaluation: the same sub-sampled
    training dataframe is paired with ``self.testing_df`` once per repeat.

    Reads ``self.training_df``, ``self.sample_fraction``,
    ``self.num_test_repeats`` and ``self.testing_df``; stores the result in
    ``self.data_pairs`` as a list of ``(train_df, test_df)`` tuples. It is
    stored as a real list (not a bare ``zip`` iterator) so it can be
    iterated more than once.
    """
    # Presumably returns a random subset holding `sample_fraction` of the
    # training rows -- helper lives in load_format_data; verify there.
    local_df = load_format_data.sub_sample(
        self.training_df, self.sample_fraction
    )

    # Every repeat shares the very same two dataframe objects (no copies),
    # exactly as the original append loop did.
    self.data_pairs = [
        (local_df, self.testing_df) for _ in range(self.num_test_repeats)
    ]
def make_cv_dataset(self):
    """Create the list of (sub-training, validation) pairs by repeated k-fold CV.

    Reads ``self.training_df``, ``self.sample_fraction``,
    ``self.num_cv_splits`` and ``self.num_cv_repeats``; stores the result in
    ``self.data_pairs`` as a list of ``(train_df, validate_df)`` tuples whose
    members are complementary slices of the sub-sampled training data. It is
    stored as a real list (not a bare ``zip`` iterator) so it can be
    iterated more than once.
    """
    # Presumably returns a random subset holding `sample_fraction` of the
    # training rows -- helper lives in load_format_data; verify there.
    local_df = load_format_data.sub_sample(
        self.training_df, self.sample_fraction
    )

    # RepeatedKFold produces n_splits * n_repeats (train, test) index pairs.
    # NOTE(review): no random_state is passed, so folds may differ per call.
    kf = RepeatedKFold(
        n_splits=self.num_cv_splits, n_repeats=self.num_cv_repeats
    )

    # split() only needs the number of rows, so a zero-filled placeholder of
    # the same length stands in for the actual data.
    placeholder = np.zeros(len(local_df))

    self.data_pairs = [
        (local_df.iloc[train_index], local_df.iloc[test_index])
        for train_index, test_index in kf.split(placeholder)
    ]
def make_test_dataset(self):
    """Create the list of (full training set, test set) pairs.

    Used for repeated model performance evaluation: the same sub-sampled
    training dataframe is paired with ``self.testing_df`` once per repeat.

    Reads ``self.training_df``, ``self.sample_fraction``,
    ``self.sample_seed``, ``self.num_test_repeats`` and ``self.testing_df``;
    stores the result in ``self.data_pairs`` as a list of
    ``(train_df, test_df)`` tuples. It is stored as a real list (not a bare
    ``zip`` iterator) so it can be iterated more than once.

    Notes carried over from the original author's comments (not verifiable
    from this block alone -- confirm against the class definitions):
      * Intended for the objects defined in submodels_module.py (referred to
        as the LIST_A objects).
      * The default ``self.training_df`` is loaded from either the
        seq_to_assay_train_1,8,10.pkl file or the
        assay_to_dot_training_data.pkl file, depending on the object.
      * The default ``self.testing_df`` is loaded from the
        seq_to_dot_test.pkl or the assay_to_data_training.pkl file.
      * ``self.num_test_repeats`` defaults to either 10 or 1, depending on
        the object.
    """
    # Presumably returns a random subset holding `sample_fraction` of the
    # training rows, seeded by `sample_seed` -- helper lives in
    # load_format_data.py; verify there.
    local_df = load_format_data.sub_sample(
        self.training_df, self.sample_fraction, self.sample_seed
    )

    # Every repeat shares the very same two dataframe objects (no copies),
    # exactly as the original append loop did.
    self.data_pairs = [
        (local_df, self.testing_df) for _ in range(self.num_test_repeats)
    ]