Example #1
  def __init__(self,
               masked_lm_prob=0.15,
               mask_token_prob=0.80,
               resample_token_prob=0.50,
               deterministic_n_lm_tokens=True,
               **kwargs):
    super().__init__(**kwargs)
    self._masked_lm_prob = masked_lm_prob
    self._mask_token_prob = mask_token_prob
    self._resample_token_prob = resample_token_prob
    self._deterministic_n_lm_tokens = deterministic_n_lm_tokens

    # tf.where-based "branching" for BERT's Cloze task
    self._branch1_sampler = (
        tfp.distributions.Uniform() if self._deterministic_n_lm_tokens
        else tfp.distributions.Bernoulli(
            probs=self._masked_lm_prob, dtype=tf.bool)
        )
    self._branch2_sampler = tfp.distributions.Bernoulli(
        probs=self._mask_token_prob, dtype=tf.bool)
    self._branch3_sampler = tfp.distributions.Bernoulli(
        probs=self._resample_token_prob, dtype=tf.bool)

    # Resample (integer-valued) tokens uniformly at random, ignoring any special
    # tokens in the vocabulary
    self._resample_sampler = vocabulary.Sampler(self._vocab)
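With the defaults above this reproduces BERT's usual corruption split: roughly 15% of tokens become prediction targets, 80% of those are replaced by the mask token, and the remaining 20% are split evenly (resample_token_prob=0.50) between a random token and the original one. The excerpt stops before the boolean draws are combined; the sketch below shows one plausible way the tf.where "branching" mentioned in the comment could be applied. The function name, mask_code and the precomputed boolean/random inputs are illustrative assumptions, not part of the repository.

import tensorflow as tf

def apply_cloze_masking(tokens, is_target, use_mask, use_resample,
                        mask_code, random_tokens):
  # is_target / use_mask / use_resample are boolean draws from the three
  # samplers above; random_tokens come from the resample sampler.
  masked = is_target & use_mask                     # replaced by the mask token
  resampled = is_target & ~use_mask & use_resample  # replaced by a random token
  out = tf.where(masked, tf.cast(mask_code, tokens.dtype), tokens)
  out = tf.where(resampled, random_tokens, out)
  return out, is_target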
Example #2
def make_fake_sequence_dataset(num_examples=1000):
  voc = vocabulary.alternative
  sampler = vocabulary.Sampler(voc)
  ds = tf.data.Dataset.from_tensor_slices({
      'sequence': sampler.sample((num_examples, 128)),
      'seq_key': tf.range(num_examples, dtype=tf.int32),
      'fam_key': tf.range(num_examples, 2 * num_examples, dtype=tf.int32),
  })
  return ds
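The fake-data helpers in these examples all follow the same recipe: draw integer token ids from a vocabulary.Sampler and wrap them in a tf.data.Dataset. A quick usage sketch for a test; the shapes simply restate the constants used above.

import tensorflow as tf

ds = make_fake_sequence_dataset(num_examples=8).batch(4)
for batch in ds:
  # Four sequences of 128 sampled token ids per batch, plus integer keys.
  assert batch['sequence'].shape == (4, 128)
  assert batch['seq_key'].dtype == tf.int32
  assert batch['fam_key'].dtype == tf.int32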
Example #3
def make_fake_homology_dataset(num_examples=1000, seq_len=128):
  voc = vocabulary.proteins
  sampler = vocabulary.Sampler(voc)
  return tf.data.Dataset.from_tensor_slices({
      'sequence': sampler.sample((num_examples, seq_len)),
      'target': tf.random.uniform(shape=(num_examples,)) > 0.8,
      'weights': tf.ones(shape=(num_examples, seq_len), dtype=tf.float32),
  })
Example #4
  def __init__(self,
               max_len=512,
               len_increase_ratio=2.0,
               logits=None,
               gap_token='-',
               **kwargs):
    super().__init__(**kwargs)
    self._max_len = max_len
    self._len_increase_ratio = len_increase_ratio
    # Sampler over the vocabulary, biased by PFAM_LOGITS unless custom
    # logits are provided.
    self._sampler = vocabulary.Sampler(
        vocab=self._vocab,
        logits=self.PFAM_LOGITS if logits is None else logits)
    self._gap_token = gap_token
    self._gap_code = self._vocab.get(self._gap_token)
Example #5
  def __init__(self,
               max_len=512,
               tau=0.01,
               alpha=0.05,
               eta=0.7,
               vocab=None):
    self._max_len = max_len
    vocab = vocabulary.get_default() if vocab is None else vocab
    self._sampler = vocabulary.Sampler(vocab=vocab)
    self._eos = vocab.get(vocab.specials[-1])
    self._pad = vocab.padding_code

    # Transition look-up table (excluding special initial transition).
    look_up = {
        (self.MATCH, self.MATCH): 1,
        (self.GAP_IN_X, self.MATCH): 2,
        (self.GAP_IN_Y, self.MATCH): 3,
        (self.MATCH, self.GAP_IN_X): 4,
        (self.GAP_IN_X, self.GAP_IN_X): 5,
        (self.GAP_IN_Y, self.GAP_IN_X): 9,  # "forbidden" transition.
        (self.MATCH, self.GAP_IN_Y): 6,
        (self.GAP_IN_X, self.GAP_IN_Y): 7,
        (self.GAP_IN_Y, self.GAP_IN_Y): 8,
    }
    # Builds data structures for efficiently encoding transitions.
    self._hash_fn = lambda d0, d1: 3 * (d1 + 1) + (d0 + 1)
    hashes = [self._hash_fn(d0, d1) for (d0, d1) in look_up]
    trans_encoder = tf.scatter_nd(
        indices=[[x] for x in hashes],
        updates=list(look_up.values()),
        shape=[max(hashes) + 1])
    self._trans_encoder = tf.cast(trans_encoder, tf.int32)
    self._init_trans = tf.convert_to_tensor([self.INIT_TRANS], dtype=tf.int32)

    # Conditional next-state probabilities between alignment states
    # (rows: previous state, columns: next state); zero entries become
    # -inf logits below so they can never be sampled.
    cond_probs = tf.convert_to_tensor(
        [[0.0, 1.0, 0.0, 0.0, 0.0],
         [0.0, 1.0 - 2.0 * alpha - tau, alpha, alpha, tau],
         [0.0, eta, 1.0 - eta - alpha, alpha, 0.0],
         [0.0, eta, 0.0, 1.0 - eta, 0.0],
         [0.0, 0.0, 0.0, 0.0, 1.0]],
        tf.float32)
    self._logits = tf.where(cond_probs > 0.0, tf.math.log(cond_probs), -np.inf)

    self._delta_len_x = tf.convert_to_tensor([0, 1, 0, 1, 0])
    self._delta_len_y = tf.convert_to_tensor([0, 1, 1, 0, 0])
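The hash maps an ordered pair of consecutive alignment states to a slot in _trans_encoder, so whole state sequences can be turned into transition codes with a single gather. A hedged sketch of that encoding step follows; the encode_transitions helper and its call signature are assumptions for illustration, not the class's actual method.

import tensorflow as tf

def encode_transitions(states, hash_fn, trans_encoder, init_trans):
  # states: int32 [len] tensor of alignment states (MATCH / GAP_IN_X / GAP_IN_Y).
  # Returns the special initial transition followed by one code per
  # (previous state, current state) pair.
  prev, curr = states[:-1], states[1:]
  codes = tf.gather(trans_encoder, hash_fn(prev, curr))
  return tf.concat([init_trans, codes], axis=0)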
Example #6
def make_fake_dataset(num_examples=1000):
    voc = vocabulary.proteins
    sampler = vocabulary.Sampler(voc)
    ds = tf.data.Dataset.from_tensor_slices(sampler.sample(
        (num_examples, 128)))
    return ds.map(lambda x: {'sequence': x})
  def setUp(self):
    super().setUp()
    gin.clear_config()
    tf.random.set_seed(0)
    self.sampler = vocabulary.Sampler()
    self.seq = self.sampler.sample((256,))
  def setUp(self):
    super().setUp()
    tf.random.set_seed(0)
    self.vocab = vocabulary.alternative
    self.sampler = vocabulary.Sampler(vocab=self.vocab)
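Both setUp variants build a Sampler fixture, one with the default vocabulary and one with vocabulary.alternative, and draw test sequences from it. A hypothetical test body on top of either fixture; it restates only what the sample calls above already imply.

  def test_sample_shape(self):
    batch = self.sampler.sample((4, 32))
    # Sampled token ids come back with exactly the requested shape.
    self.assertEqual(batch.shape, (4, 32))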