def dump_numpy_weight_files_from_tf_ckpt(ckpt):
    print('Dumping numpy weight files for:', ckpt)
    tf.reset_default_graph()
    # Instantiate a babbler just to build the graph; the base 1900 weights
    # loaded here are not the weights we actually want to keep.
    b = babbler(batch_size=10, model_path="./1900_weights")
    # Dump the weights stored in the checkpoint as numpy files next to it.
    b.dump_checkpoint_weights(ckpt, os.path.dirname(ckpt))
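
# Usage sketch (hedged; the checkpoint path below is hypothetical). The helper
# above rebuilds the 1900-unit babbler graph and writes the weights stored in a
# fine-tuned TensorFlow checkpoint out as numpy files in the checkpoint's
# directory, so that directory can later be passed as model_path to babbler().
example_ckpt = './evotuned/unirep/model.ckpt'  # hypothetical checkpoint prefix
dump_numpy_weight_files_from_tf_ckpt(example_ckpt)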
def load_base_model(model_name):
    print('Setting up base model')
    tf.reset_default_graph()
    if model_name == 'ET_Global_Init_1':
        base_model = babbler(batch_size=UNIREP_BATCH_SIZE,
                             model_path=paths.GFP_ET_GLOBAL_INIT_1_WEIGHT_PATH)
        print('Loading weights from:', paths.GFP_ET_GLOBAL_INIT_1_WEIGHT_PATH)
    elif model_name == 'ET_Global_Init_2':
        base_model = babbler(batch_size=UNIREP_BATCH_SIZE,
                             model_path=paths.GFP_ET_GLOBAL_INIT_2_WEIGHT_PATH)
        print('Loading weights from:', paths.GFP_ET_GLOBAL_INIT_2_WEIGHT_PATH)
    elif model_name == 'ET_Random_Init_1':
        base_model = babbler(batch_size=UNIREP_BATCH_SIZE,
                             model_path=paths.GFP_ET_RANDOM_INIT_1_WEIGHT_PATH)
        print('Loading weights from:', paths.GFP_ET_RANDOM_INIT_1_WEIGHT_PATH)
    elif model_name == 'OneHot':
        # Only needed to generate one-hot reps.
        # The top model created inside OneHotRegressionModel doesn't actually get used.
        base_model = models.OneHotRegressionModel('EnsembledRidge')
    else:
        assert False, 'Unsupported base model'
    return base_model
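
# Usage sketch (hedged): model names follow the branches handled above. The
# returned object is either a babbler holding the corresponding evotuned weights
# or a OneHotRegressionModel used only to generate one-hot representations.
et_global_1_model = load_base_model('ET_Global_Init_1')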
def inference_seqs(seqs, model_weight_path, batch_size=500, return_loglikes=False):
    tf.reset_default_graph()
    b = babbler(batch_size=batch_size, model_path=model_weight_path)
    with tf.Session() as sess:
        print('Initializing variables')
        sess.run(tf.global_variables_initializer())

        avg_hiddens = []
        logits = []
        for i in range(0, len(seqs), batch_size):
            print()
            min_idx = i
            max_idx = min(i + batch_size, len(seqs))
            print(min_idx, max_idx)

            if return_loglikes:
                ah, lg = batch_avg_hidden(b, seqs[min_idx:max_idx], sess,
                                          return_logits=return_loglikes)
                avg_hiddens.append(ah)
                logits += lg
            else:
                avg_hiddens.append(
                    batch_avg_hidden(b, seqs[min_idx:max_idx], sess))

    all_seq_avg_hidden = np.vstack(avg_hiddens)

    if return_loglikes:
        assert len(seqs) == len(logits)
        for i in range(len(seqs)):
            # Each sequence contributes len(seq) + 1 prediction positions
            # (one per residue plus the stop token).
            assert len(seqs[i]) + 1 == logits[i].shape[0], \
                (len(seqs[i]), logits[i].shape[0])

        log_likes = np.array(
            [calc_seq_loglike(seqs[i], logits[i]) for i in range(len(seqs))])

        return all_seq_avg_hidden, logits, log_likes
    else:
        return all_seq_avg_hidden
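
# Usage sketch (hedged): `my_seqs` is hypothetical and the weight directory is
# assumed to exist. With return_loglikes=True the function returns, per sequence,
# a row of averaged hidden states, the per-position logits, and the summed
# log-likelihood under the mLSTM.
my_seqs = ['MSKGEELFTG', 'MASKGEELFT']  # hypothetical amino acid sequences
reps, seq_logits, seq_loglikes = inference_seqs(
    my_seqs, './1900_weights', batch_size=2, return_loglikes=True)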
# Import the mLSTM babbler model
from unirep import babbler64 as babbler

# Where model weights are stored.
MODEL_WEIGHT_PATH = "./64_weights"


# ## Data formatting and management

# Initialize UniRep, also referred to as the "babbler" in our code. You need to
# provide the batch size you will use and the path to the weight directory.

# In[3]:


batch_size = 256
b = babbler(batch_size=batch_size, model_path=MODEL_WEIGHT_PATH)


# UniRep needs to receive data in the correct format: a (batch_size, max_seq_len)
# matrix of integers, where each integer encodes the amino acid at that position
# and the end of each sequence is padded with 0s up to the maximum sequence
# length, giving a non-ragged rectangular matrix. We provide a formatting
# function that translates a string of amino acids into a list of integers with
# the correct codex.
#
# You could use your own data flow as long as you ensure this format is obeyed.
# Alternatively, you can use the data flow we've implemented for UniRep training,
# which happens inside the TensorFlow graph. It reads from a file of integer
# sequences, shuffles them, collects them into groups of similar length (to
# minimize padding waste) and pads them to the max length. Here's how to do that.
#
# First, sequences need to be saved in the correct format. Suppose we have a
# newline-separated file of amino acid sequences, `seqs.txt`, that we want to
# format. Note that training is currently only publicly supported for amino acid
# sequences shorter than 275 residues, as gradient updates for longer sequences
# start to get unwieldy. If you want to train on longer sequences, please reach
# out to us.
#
# Sequence formatting can be done as follows (see the sketch after this cell):

# In[4]:


# Load jackhmmer evotune seqs.
seqlens = []
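
# A minimal formatting sketch, assuming a newline-separated `seqs.txt` as
# described above (file names here are illustrative). Each valid sequence shorter
# than 275 residues is converted to the comma-separated integer codex expected by
# the bucketing data flow, and its length is recorded in `seqlens`.
with open("seqs.txt", "r") as source:
    with open("formatted.txt", "w") as destination:
        for line in source:
            seq = line.strip()
            if b.is_valid_seq(seq) and len(seq) < 275:
                formatted = ",".join(map(str, b.format_seq(seq)))
                destination.write(formatted)
                destination.write('\n')
                seqlens.append(len(seq))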
def fit(seqs,
        vals,
        save_path="output/",
        weights_path=None,
        batch_size=256,
        model_size=64,
        end_to_end=False,
        learning_rate=.001):
    # Set seeds
    tf.set_random_seed(42)
    np.random.seed(42)

    # Download starting weights
    if weights_path is None:
        if model_size == 1900:
            weights_path = "models/UniRep/base1900/"
        elif model_size == 64:
            weights_path = "models/UniRep/base64/"
    local_weight_path = download_folder_from_s3(s3_path=weights_path,
                                                directory=LOCAL_WEIGHT_PATH,
                                                bucket=BUCKET)

    # Import the mLSTM babbler model
    if model_size == 1900:
        from unirep import babbler1900 as babbler
    elif model_size == 64:
        from unirep import babbler64 as babbler

    # Initialize UniRep
    b = babbler(batch_size=batch_size, model_path=local_weight_path)

    # Format input
    with open("formatted.txt", "w") as destination:
        for i, (seq, val) in enumerate(zip(seqs, vals)):
            seq = seq.strip()
            if b.is_valid_seq(seq) and len(seq) < 275:
                formatted = ",".join(map(str, b.format_seq(seq)))
                formatted = str(int(round(val))) + "," + formatted
                if end_to_end:
                    # Append the stop token (25) to the end of the sequence.
                    formatted = formatted + "," + str(25)
                destination.write(formatted)
                destination.write('\n')

    # Bucket data
    bucket_op = b.bucket_batch_pad("formatted.txt", interval=1000)  # Large interval

    # Obtain all of the ops needed to output a representation
    final_hidden, x_placeholder, batch_size_placeholder, seq_length_placeholder, initial_state_placeholder = (
        b.get_rep_ops())
    # `final_hidden` should be a batch_size x rep_dim matrix.

    # Default model: train a basic feed-forward network as the top model, doing
    # regression with MSE loss and the Adam optimizer. We do that by:
    # 1. Defining a loss function.
    # 2. Defining an optimizer that only optimizes variables in the top model.
    # 3. Minimizing the loss inside a TensorFlow session.
    y_placeholder = tf.placeholder(tf.float32, shape=[None, 1], name="y")
    initializer = tf.contrib.layers.xavier_initializer(uniform=False)

    with tf.variable_scope("top"):
        prediction = tf.contrib.layers.fully_connected(
            final_hidden,
            1,
            activation_fn=None,
            weights_initializer=initializer,
            biases_initializer=tf.zeros_initializer())

    loss = tf.losses.mean_squared_error(y_placeholder, prediction)

    # You can train the top model first by isolating the variables of the "top"
    # scope and forcing the optimizer to only optimize these.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    if not end_to_end:
        top_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="top")
        step_op = optimizer.minimize(loss, var_list=top_variables)
    else:
        step_op = optimizer.minimize(loss)

    # We also need a helper that computes the length of each sequence in the
    # batch, so that we know which index holds the right "final" hidden state.
    def nonpad_len(batch):
        nonzero = batch > 0
        lengths = np.sum(nonzero, axis=1)
        return lengths

    # Train model
    num_iters = 50
    losses = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(num_iters):
            batch = sess.run(bucket_op)
            y = batch[:, 0]
            y = list(map(lambda val: [val], y))
            batch = batch[:, 1:]
            length = nonpad_len(batch)
            loss_, _ = sess.run(
                [loss, step_op],
                feed_dict={
                    x_placeholder: batch,
                    y_placeholder: y,
                    batch_size_placeholder: batch_size,
                    seq_length_placeholder: length,
                    initial_state_placeholder: b._zero_state
                })
            losses.append(loss_)
            print("Iteration {0}: {1}".format(i, loss_))

        save_weights(sess, save_path)

    return (losses, save_path)
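
# Usage sketch (hedged): the sequence and label lists below are hypothetical.
# With the defaults this trains only the "top" regression head on 64-unit UniRep
# representations for 50 iterations and returns the per-iteration losses plus the
# directory the weights were saved to.
example_train_seqs = ['MSKGEELFTG', 'MASKGEELFT']  # hypothetical sequences
example_train_vals = [1.0, 0.5]                    # hypothetical regression targets
losses, saved_to = fit(example_train_seqs, example_train_vals,
                       model_size=64, batch_size=2)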
# Somewhat out of place, but set the initial sequences for simulated annealing,
# as well as the mutation rate for each chain.
init_seqs = A006_common.propose_seqs(
    [constants.AVGFP_AA_SEQ] * n_chains,
    [SIM_ANNEAL_INIT_SEQ_MUT_RADIUS] * n_chains,
    min_pos=A006_common.GFP_LIB_REGION[0],
    max_pos=A006_common.GFP_LIB_REGION[1])

# One mutation rate per chain, drawn uniformly from [1.0, 2.5).
mu_muts_per_seq = 1.5 * np.random.rand(n_chains) + 1
print('mu_muts_per_seq:', mu_muts_per_seq)  # debug

# Set up base model
print('Setting up base model')
tf.reset_default_graph()
if model_name == 'ET_Global_Init_1':
    base_model = babbler(batch_size=UNIREP_BATCH_SIZE,
                         model_path=paths.GFP_ET_GLOBAL_INIT_1_WEIGHT_PATH)
    print('Loading weights from:', paths.GFP_ET_GLOBAL_INIT_1_WEIGHT_PATH)
elif model_name == 'ET_Global_Init_2':
    base_model = babbler(batch_size=UNIREP_BATCH_SIZE,
                         model_path=paths.GFP_ET_GLOBAL_INIT_2_WEIGHT_PATH)
    print('Loading weights from:', paths.GFP_ET_GLOBAL_INIT_2_WEIGHT_PATH)
elif model_name == 'ET_Random_Init_1':
    base_model = babbler(batch_size=UNIREP_BATCH_SIZE,
                         model_path=paths.GFP_ET_RANDOM_INIT_1_WEIGHT_PATH)
    print('Loading weights from:', paths.GFP_ET_RANDOM_INIT_1_WEIGHT_PATH)
elif model_name == 'OneHot':
    # Only needed to generate one-hot reps.
    # The top model created inside OneHotRegressionModel doesn't actually get used.
    base_model = models.OneHotRegressionModel('EnsembledRidge')
else:
    assert False, 'Unsupported base model'
def is_fasta(filename):
    with open(filename, "r") as handle:
        fasta = SeqIO.parse(handle, "fasta")
        # True if at least one record can be parsed as FASTA.
        return any(fasta)


tf.set_random_seed(42)
np.random.seed(42)

fastafile = sys.argv[1]

# Import the mLSTM babbler model
from unirep import babbler1900 as babbler

# Where model weights are stored.
MODEL_WEIGHT_PATH = "./1900_weights"
b = babbler(model_path=MODEL_WEIGHT_PATH)

print('UniRep Encoding:')
with open(fastafile, "r") as handle:
    if is_fasta(fastafile):
        fasta = SeqIO.parse(handle, "fasta")
        d = {}
        for record in fasta:
            try:
                sequence = record.seq
                ids = record.id
                # Expect UniProt-style headers: >sp|ID|ORGANISM
                d[ids.split('|')[1]] = sequence
            except Exception:
                print('Something wrong', '\n',
                      'fasta file should start with >sp|ID|ORGANISM')
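
# Continuation sketch (hedged): assuming the FASTA parsed successfully and `d`
# now maps accession -> Bio.Seq sequence, compute the UniRep representation of
# each entry with the babbler's get_rep(), which returns the average hidden
# state, final hidden state and final cell state.
reps = {}
for acc, seq in d.items():
    avg_hidden, final_hidden, final_cell = b.get_rep(str(seq))
    reps[acc] = avg_hidden  # the 1900-d UniRep representation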
def compute_UniRep(sequences, y, destination_name, model_1900=True, download=False):
    """
    Computes the UniRep representations of the input sequences. It requires that
    the UniRep GitHub repo is on the path; this function is adapted from their
    tutorial notebook.

    GitHub repository: https://github.com/churchlab/UniRep/

    Arguments:
        sequences: list of sequences in string format
        y: label column
        destination_name: file name to pickle the formatted sequences to
        model_1900: use the full representation (dim=1900) or the smaller one with dim=64
        download: if True, download the model weights from the AWS S3 bucket
    """
    if model_1900:
        # Sync relevant weight files
        if download:
            subprocess.run([
                'aws', 's3', 'sync', '--no-sign-request', '--quiet',
                's3://unirep-public/1900_weights/', '1900_weights/'
            ])
        # Import the mLSTM babbler model
        from unirep import babbler1900 as babbler
        # Where model weights are stored.
        MODEL_WEIGHT_PATH = "./1900_weights"
    else:
        # Sync relevant weight files
        if download:
            subprocess.run([
                'aws', 's3', 'sync', '--no-sign-request', '--quiet',
                's3://unirep-public/64_weights/', '64_weights/'
            ])
        # Import the mLSTM babbler model
        from unirep import babbler64 as babbler
        # Where model weights are stored.
        MODEL_WEIGHT_PATH = "./64_weights"

    batch_size = 12
    b = babbler(batch_size=batch_size, model_path=MODEL_WEIGHT_PATH)

    UniRep_sequences = []
    fusion_sequences = []
    N = len(sequences)
    for i, seq in enumerate(sequences):
        print("Formatting sequence {}/{}".format(i + 1, N), end='\r')
        avg_hidden, final_hidden, final_cell = b.get_rep(seq)
        # Save the average hidden state, as this is the UniRep representation.
        UniRep_sequences.append([avg_hidden, y[i]])
        # Concatenate to get the UniRep-fusion representation.
        fusion = np.stack((avg_hidden, final_hidden, final_cell), axis=1)
        fusion_sequences.append([fusion, y[i]])

    # Create two file names for the two different representations.
    split_name = destination_name.split('.')
    fusion_name = "{}_fusion.pkl".format("".join(split_name[:-1]))
    unirep_name = "{}_UniRep.pkl".format("".join(split_name[:-1]))

    # Dump the lists.
    with open(unirep_name, 'wb') as destination:
        pickle.dump(UniRep_sequences, destination)
    with open(fusion_name, 'wb') as destination:
        pickle.dump(fusion_sequences, destination)
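
# Usage sketch (hedged): the inputs below are hypothetical. For
# destination_name='reps.pkl' this writes two pickle files, reps_UniRep.pkl and
# reps_fusion.pkl, each containing [representation, label] pairs, using the
# 64-unit model with weights synced from the public S3 bucket.
example_sequences = ['MSKGEELFTG', 'MASKGEELFT']  # hypothetical sequences
example_labels = [1, 0]                           # hypothetical labels
compute_UniRep(example_sequences, example_labels, 'reps.pkl',
               model_1900=False, download=True)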