Example #1
import sys
from itertools import combinations

import submodels_module as modelbank


def main():
    # create a list of every possible combination of the 10 assays (sizes 1 through 10)
    a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    combin_list = []
    for i in range(1, 11):
        combin_list_temp = combinations(a, i)
        for j in combin_list_temp:
            combin_list.append(j)

    # the toggle number uses the supercomputer's job array ID to determine which assay combination and architecture to train
    toggle_no = int(sys.argv[1])

    #determine the model architecture
    b_models = ['ridge', 'forest', 'svm', 'fnn']
    if toggle_no < 10000:
        arch = b_models[0]
    elif toggle_no < 20000:
        arch = b_models[1]
        toggle_no = toggle_no - 10000
    elif toggle_no < 30000:
        arch = b_models[2]
        toggle_no = toggle_no - 20000
    elif toggle_no < 40000:
        arch = b_models[3]
        toggle_no = toggle_no - 30000

    b = modelbank.assay_to_yield_model(combin_list[toggle_no], arch, 1)
    b.cross_validate_model()
    b.test_model()
    b.plot()
    b.save_predictions()
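
# A minimal sketch, not part of the original script: on a SLURM-style cluster the job array
# exposes the SLURM_ARRAY_TASK_ID environment variable, so the toggle number could be read
# from it directly (get_toggle_no is a hypothetical helper; the script above simply uses sys.argv[1]).
import os

def get_toggle_no():
    # fall back to the command-line argument when not running inside an array job
    return int(os.environ.get('SLURM_ARRAY_TASK_ID', sys.argv[1]))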
Example #2
    def assay_to_yield_best_arch(self):
        self.compare_test = False
        self.get_control = self.get_assay_control

        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        combin_list = []
        for i in range(1, 11):
            combin_list_temp = combinations(a, i)
            for j in combin_list_temp:
                combin_list.append(j)

        b_models = ['ridge', 'forest', 'svm', 'fnn']
        # b_models=b_models[0:3]

        combin_list = combin_list[0:10]
        best_model_per_combin = []
        for combin in combin_list:
            model_list = []
            for arch in b_models:
                model_list.append(
                    modelbank.assay_to_yield_model(combin, arch, 1))
            best_model_per_combin.append(self.get_best_model(model_list))

        best_model = self.get_best_model(best_model_per_combin,
                                         save=True,
                                         plot='assay_to_yield_best_arch')
        print(best_model.model_name)
Example #3
def main():
    a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    combin_list = []
    for i in range(1, 11):
        combin_list_temp = combinations(a, i)
        for j in combin_list_temp:
            combin_list.append(j)

    toggle_no = int(sys.argv[1])

    # if toggle_no<12:
    ### seq_to_yield model using measured yields
    # a_models=['ridge','forest','svm','fnn','emb_fnn_maxpool','emb_fnn_flat','emb_rnn','emb_cnn','small_emb_rnn','small_emb_atn_rnn','small_emb_cnn','small_emb_atn_cnn']
    # a=modelbank.seq_to_yield_model(a_models[toggle_no],1)
    # a=modelbank.final_seq_to_yield_model(a_models[toggle_no],sample_size)
    # a.cross_validate_model()
    # a.test_model()
    # a.plot()

    # ### assay_to_yield_model
    b_models = ['ridge', 'forest', 'svm', 'fnn']
    if toggle_no < 10000:
        arch = b_models[0]
    elif toggle_no < 20000:
        arch = b_models[1]
        toggle_no = toggle_no - 10000
    elif toggle_no < 30000:
        arch = b_models[2]
        toggle_no = toggle_no - 20000
    elif toggle_no < 40000:
        arch = b_models[3]
        toggle_no = toggle_no - 30000

    b = modelbank.assay_to_yield_model(combin_list[toggle_no], arch, 1)
    b.cross_validate_model()
Example #4
 def assay_to_yield_best_arch(self):
     self.compare_test=False
     self.get_control=self.get_assay_control
     ## The class variable compare_test is set to False, and get_control is pointed at the get_assay_control method
     a=[1,2,3,4,5,6,7,8,9,10]
     combin_list=[]
     ## a holds the 10 assay identifiers; combin_list will collect their combinations
     for i in range(1,11):
         combin_list_temp=combinations(a,i)
         for j in combin_list_temp:
             combin_list.append(j)
     ## every possible combination of the 10 assays is created, from a single assay up to all 10
     b_models=['ridge','forest','svm','fnn']        
     # b_models=b_models[0:3]
     combin_list=combin_list[0:10]
     best_model_per_combin=[]
     for combin in combin_list:
         model_list=[]
         for arch in b_models:
             model_list.append(modelbank.assay_to_yield_model(combin,arch,1))
         best_model_per_combin.append(self.get_best_model(model_list)) 
     ## for each assay combination, a model is built for every architecture and the best one is stored in the best_model_per_combin list
     best_model=self.get_best_model(best_model_per_combin,save=True,plot='assay_to_yield_best_arch')
     ## then the overall best model is selected from the ones compiled in best_model_per_combin using get_best_model(),
     ## and its name is printed.
     print(best_model.model_name)
Example #5
def main():
    '''
    Compare test performance when reducing the training sample size. This version is for the first paper, predicting yield from assays and one-hot encoded sequence.
    '''

    a = int(sys.argv[1])
    if a < 4:
        b = 0
    elif a < 8:
        a = a - 4
        b = 1
    elif a < 12:
        a = a - 8
        b = 2
    elif a == 12:
        b = 3
        a = a - 12
    else:
        print('incorrect toggle number')

    arch_list = ['ridge', 'svm', 'forest', 'fnn']

    # size_list=[0.055,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
    size_list = [0.7, 0.8, 0.9, 1]

    for size in size_list:
        if b == 0:
            mdl = modelbank.seqandassay_to_yield_model([1, 8, 10],
                                                       arch_list[a], size)
        elif b == 1:  #1,5,9,12
            mdl = modelbank.assay_to_yield_model([1, 8, 10], arch_list[a],
                                                 size)
        elif b == 2:
            mdl = modelbank.seq_to_yield_model(arch_list[a], size)
        elif b == 3:
            mdl = modelbank.control_to_yield_model(arch_list[a], size)

        for seed in range(9):  # the default run (no seed change) corresponds to seed=42
            mdl.change_sample_seed(seed)
            mdl.cross_validate_model()
            mdl.limit_test_set([1, 8, 10])
            mdl.test_model()
Example #6
def main():
    ## a is a list of the integers 1 through 10, one identifier per assay
    a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    combin_list = []
    for i in range(1, 11):
        combin_list_temp = combinations(a, i)
        for j in combin_list_temp:
            combin_list.append(j)
    ## every possible combination of the 10 assays (from a single assay up to all 10) is created as a tuple and appended to combin_list
    # the toggle number uses the supercomputer's job array ID to determine which assay combination and architecture to train
    toggle_no = int(sys.argv[1])
    ## sys.argv[1] is the first command-line argument (the value after the script name); it is converted to an int and assigned to toggle_no
    ## the magnitude of toggle_no determines which regression architecture is used
    # determine the model architecture
    b_models = ['ridge', 'forest', 'svm', 'fnn']
    if toggle_no < 10000:
        arch = b_models[0]
        ## If the toggle_no is less than 10000 then a ridge regression is run
    elif toggle_no < 20000:
        arch = b_models[1]
        toggle_no = toggle_no - 10000
        ## If the toggle_no is less than 20000 then a random forest regression is run and the toggle_no is decreased by 10000
    elif toggle_no < 30000:
        arch = b_models[2]
        toggle_no = toggle_no - 20000
        ## If the toggle_no is less than 30000 then a support vector regression is run and the toggle_no is decreased by 20000
    elif toggle_no < 40000:
        arch = b_models[3]
        toggle_no = toggle_no - 30000
        ## If the toggle_no is less than 40000 then a feedforward neural network is run and the toggle_no is decreased by 30000

    b = modelbank.assay_to_yield_model(combin_list[toggle_no], arch, 1)
    ## Then an assay_to_yield_model object from the submodels_module.py script is instantiated with the
    ## architecture determined by the if-elif statements above, the assay combination at index toggle_no
    ## of combin_list, and a sample fraction of 1.
    b.cross_validate_model()
    b.test_model()
    ## The cross_validate_model and test_model methods of the parent model class are run; cross-validation determines
    ## the hyperparameters, which are then used to train on the training data and make predictions on the test set.
    b.plot()
    b.save_predictions()
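
# An equivalent, more compact way to decode the toggle number (a sketch; decode_toggle is a
# hypothetical helper, not from the original repository): the ten-thousands digit selects the
# architecture and the remainder indexes the assay combination, matching the if/elif ladder
# above for toggle numbers in the range [0, 40000).
def decode_toggle(toggle_no, b_models):
    arch_idx, combin_idx = divmod(toggle_no, 10000)
    return b_models[arch_idx], combin_idx

# e.g. decode_toggle(20005, ['ridge', 'forest', 'svm', 'fnn']) returns ('svm', 5)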
Example #7
    mdl.cross_validate_model()
    mdl.limit_test_set([1, 8, 10])
    mdl.test_model()


# if __name__ == '__main__':
#     main()

loss_per_model, std_per_model = [], []
arch_list = ['ridge', 'svm', 'forest', 'fnn']

for i in range(4):
    cv_loss, test_loss, test_std = np.inf, np.inf, 0
    for arch in arch_list:
        if i == 0:
            mdl = modelbank.assay_to_yield_model([1, 8, 10], arch, 1)
        elif i == 1:
            mdl = modelbank.weighted_assay_to_yield_model([1, 8, 10], arch, 1)
        elif i == 2:
            mdl = modelbank.seqandassay_to_yield_model([1, 8, 10], arch, 1)
        else:
            mdl = modelbank.seqandweightedassay_to_yield_model([1, 8, 10],
                                                               arch, 1)
        if mdl.model_stats['cv_avg_loss'] < cv_loss:
            cv_loss = mdl.model_stats['cv_avg_loss']
            test_loss = mdl.model_stats['test_avg_loss']
            test_std = mdl.model_stats['test_std_loss']
    loss_per_model.append(test_loss)
    std_per_model.append(test_std)

seq_model = modelbank.seq_to_yield_model('forest', 1)
Example #8
            min_test_loss = np.mean(cur_test_loss)
            min_test_std = np.std(cur_test_loss)
    c_mdl_test_loss.append(min_test_loss)
    c_mdl_test_std.append(min_test_std)

oh_test_loss = []
oh_model = modelbank.seq_to_yield_model('forest', 1)
oh_test_loss.append(oh_model.model_stats['test_avg_loss'])
for i in range(9):
    oh_model.change_sample_seed(i)
    oh_test_loss.append(oh_model.model_stats['test_avg_loss'])
oh_test_std = np.std(oh_test_loss)
oh_test_loss = np.mean(oh_test_loss)

assay_test_loss = []
assay_model = modelbank.assay_to_yield_model([1, 8, 10], 'forest', 1)
assay_test_loss.append(assay_model.model_stats['test_avg_loss'])
for i in range(9):
    assay_model.change_sample_seed(i)
    assay_test_loss.append(assay_model.model_stats['test_avg_loss'])
assay_test_std = np.std(assay_test_loss)
assay_test_loss = np.mean(assay_test_loss)

control_model = modelbank.control_to_yield_model('ridge', 1)
control_loss = control_model.model_stats['test_avg_loss']
control_model.limit_test_set([1, 8, 10])
exploded_df, _, _ = load_format_data.explode_yield(control_model.testing_df)
exp_var = np.average(np.square(np.array(exploded_df['y_std'])))

fig, ax = plt.subplots(1, 1, figsize=[2.5, 2.5], dpi=300)
x = [-1, len(c_models)]
Example #9
# Each assay combination + architecture was cross-validated to train hyperparameters, then tested on a left-out test set
# Below is an example of training the models shown in Figure 3d of Golinski et al. 2020.
###

import submodels_module as modelbank

#define model parameters
#assays are numbered in order as found in SI table #
#model architectures for predicting yield are: 'ridge','forest','svm','fnn'
assay_mdl_param = {
    'assays': [1, 8, 10],
    'model_architecture': 'forest',
    'sample_fraction': 1
}
#initialize model based upon model parameters
mdl = modelbank.assay_to_yield_model(**assay_mdl_param)

###
# Other model options
#
# one-hot sequence to yield model
# seq_to_yield_param={'model_architecture':'forest', 'sample_fraction':1}
# mdl=modelbank.seq_to_yield_model(**seq_to_yield_param)

# assays and sequence model
# uses same params as assay model
# mdl=modelbank.seqandassay_to_yield_model(**assay_mdl_param)

# strain only control model
# strain_only_param={'model_architecture':'ridge', 'sample_fraction':1}
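
###
# A possible continuation (not part of the original snippet): as in the other examples in this
# collection, the initialized model can be cross-validated to select hyperparameters, evaluated
# on the left-out test set, plotted, and its predictions saved.
mdl.cross_validate_model()
mdl.test_model()
mdl.plot()
mdl.save_predictions()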
Example #10
def main():
    '''
    Compare test performance when reducing the training sample size. This version is for the first paper, predicting yield from assays and one-hot encoded sequence.
    '''
    ## A command-line input is required when running this program; the integer input
    ## should be between 0 and 12.
    a=int(sys.argv[1])
    if a<4:
        b=0
        ## if the input is less than 4, b is set to 0
    elif a<8:
        a=a-4
        b=1
        ## if a is between 4 and 7, b is set to 1 and a is reduced by 4
    elif a<12:
        a=a-8
        b=2
        ## if a is between 8 and 11, b is set to 2 and a is reduced by 8
    elif a==12:
        b=3
        a=a-12
        ## if a is equal to 12, b is set to 3 and a is set to 0
    else:
        print('incorrect toggle number')
        ## if the input is out of bounds, an error message is printed
    arch_list=['ridge','svm','forest','fnn']
    ## A string list is created containing the names of the different regression models and stored as arch_list
    # size_list=[0.055,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
    size_list=[0.7,0.8,0.9,1]
    ## a list of floats giving the training sample fractions to test is stored as size_list
    for size in size_list:
        ## for each element in size_list, the b value set in the if-elif statements above dictates
        ## which kind of submodels_module.py object is created
        ## if b = 0, a seqandassay_to_yield_model object is created with the assay list [1,8,10],
        ## the architecture at index a of arch_list, and the current sample fraction (size)
        if b==0:
            mdl=modelbank.seqandassay_to_yield_model([1,8,10],arch_list[a],size)
        ## if b = 1, an assay_to_yield_model object is created with the assay list [1,8,10],
        ## the architecture at index a of arch_list, and the current sample fraction
        elif b==1: #1,5,9,12
            mdl=modelbank.assay_to_yield_model([1,8,10],arch_list[a],size)
        ## if b = 2, a seq_to_yield_model object is created with the architecture at index a
        ## of arch_list and the current sample fraction
        elif b==2: 
            mdl=modelbank.seq_to_yield_model(arch_list[a],size)
        ## if b = 3, a control_to_yield_model object is created with the architecture at index a
        ## of arch_list and the current sample fraction
        elif b==3:
            mdl=modelbank.control_to_yield_model(arch_list[a],size)
            
        for seed in range(9): # the default run (no seed change) corresponds to seed=42
            ## for each seed in range(9), change_sample_seed() sets the model's sample seed to that value
            ## and updates the model's data and plot locations to reflect the new seed
            mdl.change_sample_seed(seed)
            ## the best hyperparameters for the given model and seed are then determined using the
            ## model's cross_validate_model() function
            mdl.cross_validate_model()
            ## next, limit_test_set(), defined in the x_to_yield_model parent class, updates the
            ## testing_df class dataframe to reflect assays 1, 8, and 10
            mdl.limit_test_set([1,8,10])
            ## finally, test_model() from the model parent class trains the model on the training data
            ## with the hyperparameters found above and predicts the testing dataset
            mdl.test_model()
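
# A sketch, not part of the original script, of how the per-seed test losses could be summarized
# afterwards, following the same pattern as Example #8 above; summarize_seeds is a hypothetical
# helper, and like Example #8 it assumes model_stats holds the saved test loss for each seed.
import numpy as np

def summarize_seeds(mdl, n_seeds=9):
    losses = [mdl.model_stats['test_avg_loss']]  # default seed (seed=42)
    for seed in range(n_seeds):
        mdl.change_sample_seed(seed)
        losses.append(mdl.model_stats['test_avg_loss'])
    return np.mean(losses), np.std(losses)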