Example #1
import tensorflow as tf

import submodels_module as mb


def build_new_model():
    'transfers weights from the old model to the new model'
    s2a_params = [[1, 8, 10], 'emb_cnn', 1]
    s2a = mb.seq_to_assay_model(*s2a_params)
    s2a._model.set_model(s2a.get_best_trial()['hyperparam'],
                         xa_len=16,
                         cat_var_len=3,
                         lin_or_sig=s2a.lin_or_sig)
    s2a.load_model(0)
    s2e_model = s2a._model.get_seq_embeding_layer_model()

    space = s2a.get_best_trial()['hyperparam']
    filters = int(space['filters'])
    kernel_size = int(space['kernel_size'])
    input_drop = space['input_drop']
    emb_dim = int(space['AA_emb_dim'])

    new_s2e = tf.keras.Sequential()
    new_s2e.add(tf.keras.layers.Embedding(21, emb_dim, input_length=16))
    new_s2e.add(
        tf.keras.layers.Conv1D(filters=filters,
                               kernel_size=kernel_size,
                               activation='relu'))
    new_s2e.add(tf.keras.layers.GlobalMaxPool1D(name='seq_embedding'))
    new_s2e.build((None, 16))
    new_s2e.compile()
    new_s2e.set_weights(s2e_model.get_weights())
    new_s2e.save('best_emb_model')
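
# Usage sketch (assumption, not in the original): build and save the model, then
# reload it with the standard Keras loader to embed new sequences.
if __name__ == '__main__':
    build_new_model()
    emb_model = tf.keras.models.load_model('best_emb_model')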
Example #2
import sys

import submodels_module as modelbank


def main():

    toggle_no = int(sys.argv[1])
    ## This script must be run from the terminal with one integer argument in the
    ## range [0, 4]; that argument is stored in the variable toggle_no.
    # c_models=['ridge','fnn','emb_fnn_flat','emb_fnn_maxpool','emb_fnn_maxpool_linear','emb_rnn','small_emb_rnn','small_emb_atn_rnn','small_emb_rnn_linear',
    #     'emb_cnn','small_emb_cnn','small_emb_atn_cnn','small_emb_cnn_linear']
    c_models = [
        'ridge', 'fnn', 'emb_fnn_flat', 'small_emb_rnn_linear', 'emb_cnn'
    ]
    ## A string list, c_models, is created containing the regression model architectures to choose from.
    for ss in [0.01, 0.1, .5]:
        ## Loop over ss, the different sample fractions to test.
        c = modelbank.seq_to_assay_model([1, 8, 10], c_models[toggle_no], ss)
        ## For each sample fraction, a seq_to_assay_model object defined in submodels_module.py is created.
        ## It is instantiated with an integer list of the assays to be used to build the model, the 'toggle_no'
        ## index of c_models to select the regression architecture, and ss to set the sample fraction.
        c.cross_validate_model()
        c.test_model()
        ## The cross_validate_model() function of the parent model class is then run
        ## to determine the hyperparameters for the regression model. Finally, those
        ## hyperparameters are used with the training dataset to train the regression model in test_model().
        c.save_predictions()
        ## The save_predictions() function defined in the x_to_assay_model parent class then saves the assay
        ## score predictions on the test dataset for use with the assay_to_yield_model predictions.
        if 'emb' in c_models[toggle_no]:
            ## If the regression model uses a learned embedding, the save_sequence_embeddings() function defined
            ## in the x_to_assay_model parent class saves the sequence embeddings of the model built above.
            c.save_sequence_embeddings()
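
# Usage sketch (assumption, not in the original): guard the entry point so the
# script can be run from the terminal, e.g. `python train_sample_fraction.py 4`
# (the file name is illustrative).
if __name__ == '__main__':
    main()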
Example #3
import sys

import submodels_module as modelbank


def main():

    toggle_no = int(sys.argv[1])

    c_models = [
        'ridge', 'fnn', 'emb_fnn_flat', 'emb_fnn_maxpool',
        'emb_fnn_maxpool_linear', 'emb_rnn', 'small_emb_rnn',
        'small_emb_atn_rnn', 'small_emb_rnn_linear', 'emb_cnn',
        'small_emb_cnn', 'small_emb_atn_cnn', 'small_emb_cnn_linear'
    ]
    c = modelbank.seq_to_assay_model([1, 8, 10], c_models[toggle_no], 1)
    c.cross_validate_model()
    c.test_model()
    c.save_predictions()
    c.save_sequence_embeddings()
Example #4
    def __init__(self,
                 s2a_params=None,
                 e2y_params=None,
                 Nb_sequences=1000,
                 Nb_positions=16):
        # TODO: check times for different numbers of sequences
        'nested sampling initialization for the number of sequences and the number of positions of ordinals'
        # initialize default model parameters
        if e2y_params is None:
            e2y_params = ['svm', 1]
        if s2a_params is None:
            s2a_params = [[1, 8, 10], 'emb_cnn', 1]

        # note: things may change between tensorflow versions
        seed_parent = int.from_bytes(os.urandom(4), sys.byteorder)
        self.g_parent = tf.random.experimental.Generator.from_seed(seed_parent)

        self.original_seq = pd.DataFrame()

        self.original_seq['Ordinal'] = sm.make_sampling_data(
            generator=self.g_parent,
            Nb_sequences=Nb_sequences,
            Nb_positions=Nb_positions)
        self.original_seq['Developability'] = np.zeros(Nb_sequences)

        self.nb_of_sequences = Nb_sequences
        self.test_seq = self.original_seq.copy()
        # self.nb_of_sequences,_=np.shape(self.original_seq['Ordinal'])

        self.s2a = mb.seq_to_assay_model(*s2a_params)
        # zero is appended here because the constructor requires a model-index parameter
        self.e2y = mb.sequence_embeding_to_yield_model(s2a_params + [0],
                                                       *e2y_params)
        self.times = pd.DataFrame()
        self.start_time = None
        self.min_yield = []
        # parent random number generator
        self.vp = []
        self.percent_pos = []
        self.vp_step = []
        self.dir_name = []

        # TODO: make a run-stats file and save it to the directory
        self.run_stats = pd.DataFrame(columns=['e2y'])
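
# Instantiation sketch (assumption: this __init__ belongs to a nested-sampling
# driver class, named NestedSampler here purely for illustration).
sampler = NestedSampler(s2a_params=[[1, 8, 10], 'emb_cnn', 1],
                        e2y_params=['svm', 1],
                        Nb_sequences=1000,
                        Nb_positions=16)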
Example #5
import sys

import submodels_module as modelbank


def main():

    toggle_no = int(sys.argv[1])
    ## This script must be run from the terminal with one integer argument in the
    ## range [0, 12]; that argument is stored in toggle_no.
    c_models = [
        'ridge', 'fnn', 'emb_fnn_flat', 'emb_fnn_maxpool',
        'emb_fnn_maxpool_linear', 'emb_rnn', 'small_emb_rnn',
        'small_emb_atn_rnn', 'small_emb_rnn_linear', 'emb_cnn',
        'small_emb_cnn', 'small_emb_atn_cnn', 'small_emb_cnn_linear'
    ]
    ## c_models is a string list, where each element corresponds to a regression model.
    c = modelbank.seq_to_assay_model([1, 8, 10], c_models[toggle_no], 1)
    ## A seq_to_assay_model object defined in submodels_module is created. It is instantiated with an integer
    ## list of the assays to be used when building the model, the 'toggle_no' index of c_models to select the
    ## regression architecture, and a float, 1, for the sample fraction.
    c.cross_validate_model()
    c.test_model()
    ## The cross_validate_model() and test_model() functions defined in the model parent class are run. These
    ## determine the best hyperparameters for this particular model; the model is then trained with those
    ## hyperparameters on the training dataset.
    c.save_predictions()
    c.save_sequence_embeddings()
Example #6
c_names = [
    'Linear Model', 'One-Hot', 'Flatten AA Prop', 'Small Recurrent',
    'Small Recurrent + Atn', 'Linear Top, Small Recurrent', 'Convolutional',
    'Small Convolutional', 'Small Convolutional + Atn'
]
c_names.reverse()
## c_names is a string list where each string names a bar to be drawn. The initial order of c_names is reversed.
c_mdl_test_loss, c_mdl_test_std = [], []
## Two empty lists, c_mdl_test_loss and c_mdl_test_std, are created to track the regression loss and the
## standard deviation of the loss for the different models.

for arch in c_models:
    ## Loop over arch, working through each element of the c_models list.
    c_prop = [[1, 8, 10], arch, 1]
    ## An integer list of the assays to be used to build the model, the current arch, and a sample fraction
    ## of 1 are stored in the list c_prop.
    mdl = modelbank.seq_to_assay_model(*c_prop)
    ## An object mdl of type seq_to_assay_model is created. This class is defined in submodels_module and is
    ## instantiated with the elements of the c_prop list.
    c_mdl_test_loss.append(mdl.model_stats['test_avg_loss'])
    c_mdl_test_std.append(mdl.model_stats['test_std_loss'])
## The average and standard deviation of the test regression loss, saved in the test_avg_loss and test_std_loss
## columns of the mdl object's model_stats dataframe, are accessed and appended to c_mdl_test_loss and
## c_mdl_test_std respectively.

control_model = modelbank.control_to_assay_model([1, 8, 10], 'ridge', 1)
control_loss = control_model.model_stats['test_avg_loss']
exploded_df, _, _ = load_format_data.explode_assays([1, 8, 10],
                                                    control_model.testing_df)
exp_var = np.average(np.square(np.array(exploded_df['y_std'])))
## A new control_to_assay_model object is created from the submodels_module.py program; it is instantiated with
## a ridge regression model and a sample fraction of 1. The average test loss of this model is stored in the
## control_loss variable. Then the explode_assays() function of the load_format_data.py program is run on the
## testing_df of the control model; the experimental variance is computed as the mean squared y_std of the
## resulting exploded_df.
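
# Sketch of how these values might be charted (an assumption; the original
# plotting code is not shown). Requires matplotlib and assumes c_names and
# c_mdl_test_loss have matching lengths.
import matplotlib.pyplot as plt

pos = range(len(c_mdl_test_loss))
fig, ax = plt.subplots()
# reverse the losses to match the reversed c_names, since barh plots bottom-up
ax.barh(pos, c_mdl_test_loss[::-1], xerr=c_mdl_test_std[::-1])
ax.set_yticks(list(pos))
ax.set_yticklabels(c_names)
ax.axvline(control_loss, color='red', linestyle='--', label='control model')
ax.axvline(exp_var, color='purple', linestyle=':', label='experimental variance')
ax.set_xlabel('test loss (MSE)')
ax.legend()
fig.savefig('figures/seq_to_assay_model_comparison.png')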
Example #7
import time

start_time = time.time()

import submodels_module as mb
import load_format_data
import pandas as pd
import numpy as np

# list the datasets of sequences to be predicted; each must contain an "Ordinal" column of paratopes
# and each file should be saved under /datasets/
df = ['seq_to_assay_train_1,8,10']  # this is just an example

#import the sequence_to_assay model (red box)
#currently uses an emb_fnn_maxpool_linear model to predict assays 1, 8, 10
#this will probably change when I find the most accurate model
s2a_params = [[1, 8, 10], 'emb_fnn_maxpool_linear', 0.01]
s2a = mb.seq_to_assay_model(*s2a_params)
#now save the sequence embeddings, file is under /datasets/predicted/learned_embedding_[model properties], col='learned_embedding'
#saves 3 different embeddings from 3 different models
s2a.save_sequence_embeddings(df)

#import the embedding_to_yield model (green box)
#average the prediction over the 3 different models, summing the yield of both cell types
#currently using a ridge model, but this will probably be modified
e2y_params = ['ridge', 1]

predicted_yield_per_model = []
for i in range(3):
    #load model
    e2y = mb.sequence_embeding_to_yield_model(s2a_params + [i], *e2y_params)

    #save predictions from learned embeddings in s2a model
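    # assumption: the elided loop body appends each model's predicted yields to
    # predicted_yield_per_model before the loop ends

# Sketch of the averaging step described above (the original snippet ends before
# this point): take the element-wise mean of the three models' predictions.
predicted_yield = np.mean(predicted_yield_per_model, axis=0)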
Example #8
# Utilizing HT assays to train DevRep, and using the DevRep embedding to predict yield.
# The assays for training DevRep are currently limited to 1, 8, 10, as those were the most predictive in the first paper.
# Below is an example of training, testing, and predicting with the best-performing architecture of DevRep.
###


import submodels_module as modelbank

#define model parameters
#assays are numbered in order as found in SI table #
#model architectures for predicting yield are: ['ridge','fnn','emb_fnn_flat','emb_fnn_maxpool','emb_fnn_maxpool_linear','emb_rnn','small_emb_rnn','small_emb_atn_rnn','small_emb_rnn_linear',
#                                               'emb_cnn','small_emb_cnn','small_emb_atn_cnn','small_emb_cnn_linear']

devrep_mdl_param = {'assays': [1, 8, 10], 'model_architecture': 'emb_cnn', 'sample_fraction': 1}
#initialize model based upon model parameters
mdl = modelbank.seq_to_assay_model(**devrep_mdl_param)

#cross-validate model
mdl.cross_validate_model()

#test the model on the limited test set
mdl.test_model()

#return the results from cv and testing
print(mdl.model_stats)

#plot the predicted results
#figure is saved in ./figures/
mdl.plot()

#save the learned embeddings
mdl.save_sequence_embeddings()
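
# Continuing the workflow (a sketch based on Example #7, not part of this
# snippet): feed the saved embeddings into an embedding-to-yield model; the
# trailing 0 selects which saved embedding version to use.
e2y_mdl = modelbank.sequence_embeding_to_yield_model(
    [[1, 8, 10], 'emb_cnn', 1, 0], 'ridge', 1)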