def main():
    """Train one assay-combination x architecture yield model, selected by job-array ID.

    Usage: script.py <toggle_no>, where
    toggle_no = 10000 * architecture_index + combination_index.
    """
    # All non-empty combinations of the 10 assays (sizes 1..10): 2**10 - 1 = 1023 total.
    assays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    combin_list = []
    for size in range(1, 11):
        combin_list.extend(combinations(assays, size))

    # The toggle number is the supercomputer's job-array ID; it determines
    # which assay combination and architecture to train.
    toggle_no = int(sys.argv[1])

    # Architecture is encoded in blocks of 10000:
    # 0-9999 -> ridge, 10000-19999 -> forest, 20000-29999 -> svm, 30000-39999 -> fnn.
    b_models = ['ridge', 'forest', 'svm', 'fnn']
    arch_idx, combin_idx = divmod(toggle_no, 10000)
    if not 0 <= arch_idx < len(b_models) or combin_idx >= len(combin_list):
        # Previously an out-of-range toggle fell through the if/elif chain and
        # crashed with a confusing NameError on `arch`; fail loudly instead.
        print('incorrect toggle number')
        return
    arch = b_models[arch_idx]

    b = modelbank.assay_to_yield_model(combin_list[combin_idx], arch, 1)
    b.cross_validate_model()  # tune hyperparameters via cross-validation
    b.test_model()            # retrain with tuned hyperparameters, predict test set
    b.plot()
    b.save_predictions()
def assay_to_yield_best_arch(self):
    """Pick the best architecture per assay combination, then the best overall model."""
    self.compare_test = False
    self.get_control = self.get_assay_control
    # Every non-empty combination of the 10 assays, smallest sets first.
    assays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    combin_list = [c for size in range(1, 11) for c in combinations(assays, size)]
    b_models = ['ridge', 'forest', 'svm', 'fnn']
    combin_list = combin_list[0:10]  # only the first ten combinations are evaluated
    best_model_per_combin = []
    for combin in combin_list:
        candidates = [modelbank.assay_to_yield_model(combin, arch, 1) for arch in b_models]
        best_model_per_combin.append(self.get_best_model(candidates))
    # Compare the per-combination winners, save the overall best, and report it.
    best_model = self.get_best_model(best_model_per_combin, save=True,
                                     plot='assay_to_yield_best_arch')
    print(best_model.model_name)
def main():
    """Cross-validate one assay-to-yield model chosen by the job-array toggle number.

    toggle_no = 10000 * architecture_index + combination_index,
    with architectures ordered as in b_models below.
    """
    # All non-empty combinations of the 10 assays (sizes 1..10).
    assays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    combin_list = []
    for size in range(1, 11):
        combin_list.extend(combinations(assays, size))

    toggle_no = int(sys.argv[1])

    # (A commented-out seq_to_yield variant driven by toggle_no < 12 was removed;
    # see repository history if needed.)

    ### assay_to_yield_model
    b_models = ['ridge', 'forest', 'svm', 'fnn']
    arch_idx, combin_idx = divmod(toggle_no, 10000)
    if not 0 <= arch_idx < len(b_models) or combin_idx >= len(combin_list):
        # Out-of-range toggles previously fell through with `arch` unbound (NameError).
        print('incorrect toggle number')
        return
    arch = b_models[arch_idx]

    b = modelbank.assay_to_yield_model(combin_list[combin_idx], arch, 1)
    b.cross_validate_model()
def assay_to_yield_best_arch(self):
    """Compare every architecture on each assay combination and report the winner."""
    # Compare cross-validation (not test) performance, benchmarked against
    # the assay control.
    self.compare_test = False
    self.get_control = self.get_assay_control

    # Enumerate all combinations of the ten assays, from single assays up to all ten.
    assays = list(range(1, 11))
    combin_list = []
    for n_assays in range(1, 11):
        for combo in combinations(assays, n_assays):
            combin_list.append(combo)

    architectures = ['ridge', 'forest', 'svm', 'fnn']
    combin_list = combin_list[0:10]  # restrict to the first ten combinations

    # For each combination, keep whichever architecture performs best.
    best_model_per_combin = []
    for combo in combin_list:
        models = []
        for architecture in architectures:
            models.append(modelbank.assay_to_yield_model(combo, architecture, 1))
        best_model_per_combin.append(self.get_best_model(models))

    # Select the overall best model, save it, plot the comparison, and print its name.
    best_model = self.get_best_model(best_model_per_combin, save=True,
                                     plot='assay_to_yield_best_arch')
    print(best_model.model_name)
def main():
    '''
    Compare test performances when reducing training sample size.

    This version is for the first paper, predicting yield from assays and
    one-hot encoded sequence. sys.argv[1] must be an integer in [0, 12]:
    it encodes both the architecture index a (within its family) and the
    model family b (0: seq+assay, 1: assay-only, 2: sequence-only,
    3: strain-only control).
    '''
    a = int(sys.argv[1])
    if not 0 <= a <= 12:
        # Previously execution continued past the error message and crashed
        # later with a NameError on the unbound `b`; bail out explicitly.
        print('incorrect toggle number')
        return
    # Equivalent to the original if/elif ladder: family = a // 4, index = a % 4
    # (a == 12 yields b == 3, a == 0, i.e. the ridge control model).
    b, a = divmod(a, 4)

    arch_list = ['ridge', 'svm', 'forest', 'fnn']
    # size_list=[0.055,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
    size_list = [0.7, 0.8, 0.9, 1]
    for size in size_list:
        if b == 0:
            mdl = modelbank.seqandassay_to_yield_model([1, 8, 10], arch_list[a], size)
        elif b == 1:  # 1,5,9,12
            mdl = modelbank.assay_to_yield_model([1, 8, 10], arch_list[a], size)
        elif b == 2:
            mdl = modelbank.seq_to_yield_model(arch_list[a], size)
        elif b == 3:
            mdl = modelbank.control_to_yield_model(arch_list[a], size)
        for seed in range(9):  # default (no explicit seed) is seed=42
            mdl.change_sample_seed(seed)
            mdl.cross_validate_model()
            mdl.limit_test_set([1, 8, 10])
            mdl.test_model()
def main():
    """Train and evaluate one assay-to-yield model selected by the job-array ID.

    sys.argv[1] encodes both the assay combination and the architecture:
    toggle_no = 10000 * architecture_index + combination_index.
    """
    # All 1023 non-empty combinations of the 10 assays (sizes 1 through 10).
    assays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    combin_list = []
    for size in range(1, 11):
        combin_list.extend(combinations(assays, size))

    # The toggle number is the supercomputer's job-array ID; its magnitude
    # selects the architecture and its remainder the assay combination.
    toggle_no = int(sys.argv[1])

    # 0-9999: ridge, 10000-19999: random forest, 20000-29999: SVM,
    # 30000-39999: feedforward neural network.
    b_models = ['ridge', 'forest', 'svm', 'fnn']
    arch_idx, combin_idx = divmod(toggle_no, 10000)
    if not 0 <= arch_idx < len(b_models) or combin_idx >= len(combin_list):
        # An out-of-range toggle previously left `arch` unbound -> NameError.
        print('incorrect toggle number')
        return
    arch = b_models[arch_idx]

    # Instantiate assay_to_yield_model (submodels_module.py) with the chosen
    # assay combination, architecture, and a sample fraction of 1.
    b = modelbank.assay_to_yield_model(combin_list[combin_idx], arch, 1)
    # cross_validate_model() determines the hyperparameters; test_model()
    # retrains with them and predicts the held-out test set.
    b.cross_validate_model()
    b.test_model()
    b.plot()
    b.save_predictions()
# NOTE(review): the three calls below appear to be the tail of a training loop
# from a function that starts before this chunk — TODO confirm against the full file.
mdl.cross_validate_model()
mdl.limit_test_set([1, 8, 10])
mdl.test_model()

# if __name__ == '__main__':
#     main()

# For each of four model families, pick the architecture with the lowest
# cross-validation loss and record its test loss and test-loss std.
loss_per_model, std_per_model = [], []
arch_list = ['ridge', 'svm', 'forest', 'fnn']
for i in range(4):
    cv_loss, test_loss, test_std = np.inf, np.inf, 0
    for arch in arch_list:
        # i selects the family: 0 assay-only, 1 weighted assays,
        # 2 sequence+assays, 3 sequence+weighted assays.
        if i == 0:
            mdl = modelbank.assay_to_yield_model([1, 8, 10], arch, 1)
        elif i == 1:
            mdl = modelbank.weighted_assay_to_yield_model([1, 8, 10], arch, 1)
        elif i == 2:
            mdl = modelbank.seqandassay_to_yield_model([1, 8, 10], arch, 1)
        else:
            mdl = modelbank.seqandweightedassay_to_yield_model([1, 8, 10], arch, 1)
        # Keep the test statistics of the architecture with the best CV loss.
        # (model_stats presumably come from previously saved runs — verify.)
        if mdl.model_stats['cv_avg_loss'] < cv_loss:
            cv_loss = mdl.model_stats['cv_avg_loss']
            test_loss = mdl.model_stats['test_avg_loss']
            test_std = mdl.model_stats['test_std_loss']
    loss_per_model.append(test_loss)
    std_per_model.append(test_std)

seq_model = modelbank.seq_to_yield_model('forest', 1)
# Summarize the best combined model's test performance across sample seeds.
# NOTE(review): cur_test_loss, c_mdl_test_loss, c_mdl_test_std, and c_models are
# defined earlier in the file (outside this chunk) — TODO confirm.
min_test_loss = np.mean(cur_test_loss)
min_test_std = np.std(cur_test_loss)
c_mdl_test_loss.append(min_test_loss)
c_mdl_test_std.append(min_test_std)

# One-hot sequence model: collect test loss for the default seed plus seeds 0-8.
oh_test_loss = []
oh_model = modelbank.seq_to_yield_model('forest', 1)
oh_test_loss.append(oh_model.model_stats['test_avg_loss'])
for i in range(9):
    oh_model.change_sample_seed(i)
    oh_test_loss.append(oh_model.model_stats['test_avg_loss'])
oh_test_std = np.std(oh_test_loss)
oh_test_loss = np.mean(oh_test_loss)  # rebinds the list name to its scalar mean

# Assay-based model (assays 1, 8, 10): same seed-averaging procedure.
assay_test_loss = []
assay_model = modelbank.assay_to_yield_model([1, 8, 10], 'forest', 1)
assay_test_loss.append(assay_model.model_stats['test_avg_loss'])
for i in range(9):
    assay_model.change_sample_seed(i)
    assay_test_loss.append(assay_model.model_stats['test_avg_loss'])
assay_test_std = np.std(assay_test_loss)
assay_test_loss = np.mean(assay_test_loss)

# Strain-only control model, plus the explainable variance of the test set
# (mean of squared y_std), used as a reference level in the plot below.
control_model = modelbank.control_to_yield_model('ridge', 1)
control_loss = control_model.model_stats['test_avg_loss']
control_model.limit_test_set([1, 8, 10])
exploded_df, _, _ = load_format_data.explode_yield(control_model.testing_df)
exp_var = np.average(np.square(np.array(exploded_df['y_std'])))

fig, ax = plt.subplots(1, 1, figsize=[2.5, 2.5], dpi=300)
x = [-1, len(c_models)]
# Each assay combination + architecture was cross-validated to train hyper parameters, then tested on a left-out test set # Below shows the example of training the models shown in Figure 3d of Golinski et. al 2020. ### import submodels_module as modelbank #define model parameters #assays are numbered in order as found in SI table # #model architectures for predicting yield are: 'ridge','forest','svm','fnn' assay_mdl_param = { 'assays': [1, 8, 10], 'model_architecture': 'forest', 'sample_fraction': 1 } #initialize model based upon model parameters mdl = modelbank.assay_to_yield_model(**assay_mdl_param) ### # Other model options # # one-hot sequence to yield model # seq_to_yield_param={'model_architecture':'forest', 'sample_fraction':1} # mdl=modelbank.seq_to_yield_model(**seq_to_yield_param) # assays and sequence model # uses same params as assay model # mdl=modelbank.seqandassay_to_yield_model(**assay_mdl_param) # strain only control model # strain_only_param={'model_architecture':'ridge', 'sample_fraction':1}}
def main():
    '''
    Compare test performances when reducing training sample size.

    This version is for the first paper, predicting yield from assays and
    one-hot encoded sequence. Requires a command-line integer in [0, 12],
    encoding the architecture index a and the model family b:
    0-3 sequence+assay, 4-7 assay-only, 8-11 sequence-only, 12 control.
    '''
    a = int(sys.argv[1])
    if a < 4:
        b = 0          # sequence + assay model
    elif a < 8:
        a = a - 4
        b = 1          # assay-only model
    elif a < 12:
        a = a - 8
        b = 2          # sequence-only model
    elif a == 12:
        b = 3          # strain-only control model
        a = a - 12
    else:
        print('incorrect toggle number')
        # Previously the code fell through here and crashed later with a
        # NameError on the unbound `b`; return instead.
        return

    # Candidate regression architectures and the training-set fractions to sweep.
    arch_list = ['ridge', 'svm', 'forest', 'fnn']
    # size_list=[0.055,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
    size_list = [0.7, 0.8, 0.9, 1]

    for size in size_list:
        # b selects which submodels_module model to build; arch_list[a] is the
        # regression architecture and size the training sample fraction.
        if b == 0:
            mdl = modelbank.seqandassay_to_yield_model([1, 8, 10], arch_list[a], size)
        elif b == 1:  # 1,5,9,12
            mdl = modelbank.assay_to_yield_model([1, 8, 10], arch_list[a], size)
        elif b == 2:
            mdl = modelbank.seq_to_yield_model(arch_list[a], size)
        elif b == 3:
            mdl = modelbank.control_to_yield_model(arch_list[a], size)
        for seed in range(9):  # default (no explicit seed) is seed=42
            # Re-split the training data under this seed, tune hyperparameters,
            # restrict the test set to assays 1, 8, 10, then evaluate.
            mdl.change_sample_seed(seed)
            mdl.cross_validate_model()
            mdl.limit_test_set([1, 8, 10])
            mdl.test_model()