Example #1
def dump_numpy_weight_files_from_tf_ckpt(ckpt):
    print('Dumping numpy weight files for:', ckpt)
    tf.reset_default_graph()
    b = babbler(batch_size=10,
                model_path="./1900_weights")  # placeholder base weights, only needed to build the graph
    b.dump_checkpoint_weights(
        ckpt, os.path.dirname(ckpt))  # dumps the checkpoint's weights as numpy files
Example #2
def load_base_model(model_name):
    print('Setting up base model')
    tf.reset_default_graph()

    if model_name == 'ET_Global_Init_1':
        base_model = babbler(batch_size=UNIREP_BATCH_SIZE,
                             model_path=paths.GFP_ET_GLOBAL_INIT_1_WEIGHT_PATH)
        print('Loading weights from:', paths.GFP_ET_GLOBAL_INIT_1_WEIGHT_PATH)
    elif model_name == 'ET_Global_Init_2':
        base_model = babbler(batch_size=UNIREP_BATCH_SIZE,
                             model_path=paths.GFP_ET_GLOBAL_INIT_2_WEIGHT_PATH)
        print('Loading weights from:', paths.GFP_ET_GLOBAL_INIT_2_WEIGHT_PATH)
    elif model_name == 'ET_Random_Init_1':
        base_model = babbler(batch_size=UNIREP_BATCH_SIZE,
                             model_path=paths.GFP_ET_RANDOM_INIT_1_WEIGHT_PATH)
        print('Loading weights from:', paths.GFP_ET_RANDOM_INIT_1_WEIGHT_PATH)
    elif model_name == 'OneHot':
        # Just need it to generate one-hot reps.
        # Top model created within OneHotRegressionModel doesn't actually get used.
        base_model = models.OneHotRegressionModel('EnsembledRidge')
    else:
        assert False, 'Unsupported base model'

    return base_model
Example #3
def inference_seqs(seqs, model_weight_path, batch_size=500, return_loglikes=False):
    tf.reset_default_graph()
    
    b = babbler(batch_size=batch_size, model_path=model_weight_path)
    
    with tf.Session() as sess:
        print('Initializing variables')
        sess.run(tf.global_variables_initializer())

        avg_hiddens = []
        logits = []
        for i in range(0, len(seqs), batch_size):
            print()

            min_idx = i
            max_idx = min(i+batch_size,len(seqs))
            print(min_idx, max_idx)
            
            if return_loglikes:
                ah, lg = batch_avg_hidden(b, seqs[min_idx:max_idx], sess, return_logits=return_loglikes)
                avg_hiddens.append(ah)
                logits += lg
            else:
                avg_hiddens.append(
                    batch_avg_hidden(b, seqs[min_idx:max_idx], sess))
            
    all_seq_avg_hidden = np.vstack(avg_hiddens)
    
    if return_loglikes:
        assert len(seqs) == len(logits)
        for i in range(len(seqs)):
            assert len(seqs[i])+1 == logits[i].shape[0], (len(seqs[i]), logits[i].shape[0])
        
        log_likes = np.array([calc_seq_loglike(seqs[i], logits[i]) for i in range(len(seqs))])
        return all_seq_avg_hidden, logits, log_likes
    else:
        return all_seq_avg_hidden
Example #4
# Import the mLSTM babbler model
from unirep import babbler64 as babbler

# Where model weights are stored.
MODEL_WEIGHT_PATH = "./64_weights"


# ## Data formatting and management

# Initialize UniRep, also referred to as the "babbler" in our code. You need to provide the batch size you will use and the path to the weight directory.

# In[3]:


batch_size = 256
b = babbler(batch_size=batch_size, model_path=MODEL_WEIGHT_PATH)


# UniRep needs to receive data in the correct format: a (batch_size, max_seq_len) matrix of integer values, where each integer encodes the amino acid at that position and the end of each sequence is padded with 0s up to the maximum sequence length, forming a non-ragged rectangular matrix. We provide a formatting function that translates a string of amino acids into a list of integers with the correct codex:
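
# For example, a minimal sketch assuming the babbler `b` initialized above (the sequence string is only a hypothetical illustration):

example_seq = "MKYLLPTAAAGLLLLAAQPAMA"  # hypothetical amino acid sequence
int_seq = b.format_seq(example_seq)     # list of integers in the babbler's vocabulary
print(int_seq)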

# You could use your own data flow as long as the data format above is obeyed. Alternatively, you can use the data flow we've implemented for UniRep training, which happens inside the TensorFlow graph: it reads from a file of integer sequences, shuffles them, collects sequences of similar length into groups (to minimize padding waste), and pads them to the max_length. Here's how to do that:

# First, sequences need to be saved in the correct format. Suppose we have a newline-separated file of amino acid sequences, `seqs.txt`, that we want to format. Note that training is currently only publicly supported for amino acid sequences shorter than 275 amino acids, as gradient updates for longer sequences start to get unwieldy. If you want to train on sequences longer than this, please reach out to us.
# 
# Sequence formatting can be done as follows:

# In[4]:


# Load jackhmmer evotune seqs.
seqlens = []
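
# A minimal sketch of the formatting step described above, assuming a newline-separated file `seqs.txt` and the babbler `b` initialized earlier (it mirrors the formatting loop in Example #5):

with open("seqs.txt", "r") as source, open("formatted.txt", "w") as destination:
    for seq in source:
        seq = seq.strip()
        if b.is_valid_seq(seq) and len(seq) < 275:
            destination.write(",".join(map(str, b.format_seq(seq))) + "\n")

# The in-graph data flow can then read from the formatted file via the babbler's bucketing op, which groups sequences of similar length and pads each batch:
bucket_op = b.bucket_batch_pad("formatted.txt", interval=1000)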
Example #5
def fit(seqs,
        vals,
        save_path="output/",
        weights_path=None,
        batch_size=256,
        model_size=64,
        end_to_end=False,
        learning_rate=.001):

    # Set seeds
    tf.set_random_seed(42)
    np.random.seed(42)

    # Download starting weights
    if weights_path is None:
        if model_size == 1900: weights_path = "models/UniRep/base1900/"
        elif model_size == 64: weights_path = "models/UniRep/base64/"
    local_weight_path = download_folder_from_s3(s3_path=weights_path,
                                                directory=LOCAL_WEIGHT_PATH,
                                                bucket=BUCKET)

    # Import babbler
    if model_size == 1900:
        from unirep import babbler1900 as babbler  # Import the mLSTM babbler model
    elif model_size == 64:
        from unirep import babbler64 as babbler  # Import the mLSTM babbler model

    # Initialize UniRep
    b = babbler(batch_size=batch_size, model_path=local_weight_path)

    # Format input
    with open("formatted.txt", "w") as destination:
        for i, (seq, val) in enumerate(zip(seqs, vals)):
            seq = seq.strip()
            if b.is_valid_seq(seq) and len(seq) < 275:
                formatted = ",".join(map(str, b.format_seq(seq)))
                formatted = str(int(round(val))) + "," + formatted
                if end_to_end:
                    formatted = formatted + "," + str(
                        25)  # append stop to end of sequence
                destination.write(formatted)
                destination.write('\n')

    # Bucket data
    bucket_op = b.bucket_batch_pad("formatted.txt",
                                   interval=1000)  # Large interval

    # Obtain all of the ops needed to output a representation
    final_hidden, x_placeholder, batch_size_placeholder, seq_length_placeholder, initial_state_placeholder = (
        b.get_rep_ops())
    # `final_hidden` should be a batch_size x rep_dim matrix.

    # Default top model: a basic feed-forward network (here a single fully
    # connected layer), trained as a regression with MSE loss and the Adam optimizer. We can do that by:
    # 1.  Defining a loss function.
    # 2.  Defining an optimizer that only optimizes the variables in the top model.
    # 3.  Minimizing the loss inside a TensorFlow session.
    y_placeholder = tf.placeholder(tf.float32, shape=[None, 1], name="y")
    initializer = tf.contrib.layers.xavier_initializer(uniform=False)
    with tf.variable_scope("top"):
        prediction = tf.contrib.layers.fully_connected(
            final_hidden,
            1,
            activation_fn=None,
            weights_initializer=initializer,
            biases_initializer=tf.zeros_initializer())
    loss = tf.losses.mean_squared_error(y_placeholder, prediction)

    # You can train the top model first by isolating the variables of the "top" scope and restricting the optimizer to only those variables.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    if not end_to_end:
        top_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope="top")
        step_op = optimizer.minimize(loss, var_list=top_variables)
    else:
        step_op = optimizer.minimize(loss)

    # We next need a helper that calculates the length of each sequence in the batch, so we know which index holds the correct "final" hidden state.
    def nonpad_len(batch):
        nonzero = batch > 0
        lengths = np.sum(nonzero, axis=1)
        return lengths

    # Train model
    num_iters = 50
    Loss = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(num_iters):
            batch = sess.run(bucket_op)
            y = batch[:, 0]
            y = list(map(lambda val: [val], y))
            batch = batch[:, 1:]
            length = nonpad_len(batch)
            loss_, _ = sess.run(
                [loss, step_op],
                feed_dict={
                    x_placeholder: batch,
                    y_placeholder: y,
                    batch_size_placeholder: batch_size,
                    seq_length_placeholder: length,
                    initial_state_placeholder: b._zero_state
                })
            Loss.append(loss_)
            print("Iteration {0}: {1}".format(i, loss_))
        save_weights(sess, save_path)

    return (Loss, save_path)
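

# Hypothetical usage sketch for the `fit` function above (placeholder sequences and labels; it assumes the module-level helpers referenced inside fit, such as download_folder_from_s3, LOCAL_WEIGHT_PATH, BUCKET and save_weights, are available):
example_seqs = ["MKYLLPTAAAGLLLLAAQPAMA", "MSKGEELFTGVVPILVELDGDV"]
example_vals = [0.7, 1.3]
losses, model_dir = fit(example_seqs, example_vals, batch_size=2, model_size=64)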
Example #6
# Somewhat out of place, but set initial sequences for simulated annealing as well
# as the mutation rate for each chain.
init_seqs = A006_common.propose_seqs([constants.AVGFP_AA_SEQ] * n_chains,
                                     [SIM_ANNEAL_INIT_SEQ_MUT_RADIUS] *
                                     n_chains,
                                     min_pos=A006_common.GFP_LIB_REGION[0],
                                     max_pos=A006_common.GFP_LIB_REGION[1])
mu_muts_per_seq = 1.5 * np.random.rand(n_chains) + 1
print('mu_muts_per_seq:', mu_muts_per_seq)  # debug

# Set up base model
print('Setting up base model')
tf.reset_default_graph()

if model_name == 'ET_Global_Init_1':
    base_model = babbler(batch_size=UNIREP_BATCH_SIZE,
                         model_path=paths.GFP_ET_GLOBAL_INIT_1_WEIGHT_PATH)
    print('Loading weights from:', paths.GFP_ET_GLOBAL_INIT_1_WEIGHT_PATH)
elif model_name == 'ET_Global_Init_2':
    base_model = babbler(batch_size=UNIREP_BATCH_SIZE,
                         model_path=paths.GFP_ET_GLOBAL_INIT_2_WEIGHT_PATH)
    print('Loading weights from:', paths.GFP_ET_GLOBAL_INIT_2_WEIGHT_PATH)
elif model_name == 'ET_Random_Init_1':
    base_model = babbler(batch_size=UNIREP_BATCH_SIZE,
                         model_path=paths.GFP_ET_RANDOM_INIT_1_WEIGHT_PATH)
    print('Loading weights from:', paths.GFP_ET_RANDOM_INIT_1_WEIGHT_PATH)
elif model_name == 'OneHot':
    # Just need it to generate one-hot reps.
    # Top model created within OneHotRegressionModel doesn't actually get used.
    base_model = models.OneHotRegressionModel('EnsembledRidge')
else:
    assert False, 'Unsupported base model'
Example #7
import pickle
import subprocess
import sys

import numpy as np
import tensorflow as tf
from Bio import SeqIO  # Biopython, used for FASTA parsing


def is_fasta(filename):
    with open(filename, "r") as handle:
        fasta = SeqIO.parse(handle, "fasta")
        return any(fasta)


tf.set_random_seed(42)
np.random.seed(42)

fastafile = sys.argv[1]

from unirep import babbler1900 as babbler

# Where model weights are stored.
MODEL_WEIGHT_PATH = "./1900_weights"
b = babbler(model_path=MODEL_WEIGHT_PATH)

print('UniRep Encoding:')
with open(fastafile, "r") as handle:
    if is_fasta(fastafile):
        fasta = SeqIO.parse(handle, "fasta")
        d = {}
        for record in fasta:
            try:
                sequence = record.seq
                ids = record.id
                d[ids.split('|')[1]] = sequence

            except Exception:
                print('Something wrong', '\n',
                      'fasta file should start with >sp|ID|ORGANISM')


def compute_UniRep(sequences,
                   y,
                   destination_name,
                   model_1900=True,
                   download=False):
    """
    Computes the UniRep representations of the input sequences. 
    It requires that the GitHub repo is in path and this function is adatoped from their tutorial notebook.
    GitHub repository: https://github.com/churchlab/UniRep/
    
    Arguments:
        sequences: list of sequences in string format
        y: label column
        destination_name: file name to pickle the formatted the sequences to
        model_1900: use full representation (dim=1900) or the smaller one with dim=64
        download: if True, download the model weights from aws s3 bucket
    """

    if model_1900:
        if download:
            subprocess.run([
                'aws', 's3', 'sync', '--no-sign-request', '--quiet',
                's3://unirep-public/1900_weights/', '1900_weights/'
            ])

        # Import the mLSTM babbler model
        from unirep import babbler1900 as babbler

        # Where model weights are stored.
        MODEL_WEIGHT_PATH = "./1900_weights"
    else:
        # Sync relevant weight files
        if download:
            subprocess.run([
                'aws', 's3', 'sync', '--no-sign-request', '--quiet',
                's3://unirep-public/64_weights/', '64_weights/'
            ])

        # Import the mLSTM babbler model
        from unirep import babbler64 as babbler

        # Where model weights are stored.
        MODEL_WEIGHT_PATH = "./64_weights"

    batch_size = 12
    b = babbler(batch_size=batch_size, model_path=MODEL_WEIGHT_PATH)

    UniRep_sequences = []
    fusion_sequences = []
    N = len(sequences)
    for i, seq in enumerate(sequences):
        print("Formatting sequence {}/{}".format(i + 1, N), end='\r')

        avg_hidden, final_hidden, final_cell = b.get_rep(seq)

        # save average hidden state as this is the UniRep representation
        UniRep_sequences.append([avg_hidden, y[i]])

        # concatenate to get the UniRep-fusion representation
        fusion = np.stack((avg_hidden, final_hidden, final_cell), axis=1)
        fusion_sequences.append([fusion, y[i]])

    # create two file names for the two different representations
    split_name = destination_name.split('.')
    fusion_name = "{}_fusion.pkl".format("".join(split_name[:-1]))
    unirep_name = "{}_UniRep.pkl".format("".join(split_name[:-1]))

    # dump the lists
    with open(unirep_name, 'wb') as destination:
        pickle.dump(UniRep_sequences, destination)
    with open(fusion_name, 'wb') as destination:
        pickle.dump(fusion_sequences, destination)
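

# Hypothetical usage sketch for compute_UniRep (placeholder sequences and labels; the 64-unit model is chosen so the weight download stays small):
toy_sequences = ["MKYLLPTAAAGLLLLAAQPAMA", "MSKGEELFTGVVPILVELDGDV"]
toy_labels = [0, 1]
compute_UniRep(toy_sequences, toy_labels, "toy_data.pkl",
               model_1900=False, download=True)
# Writes toy_data_UniRep.pkl (average hidden states, the UniRep representation)
# and toy_data_fusion.pkl (stacked average hidden, final hidden and final cell states).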