def make_protein_domain(include_anomalous_amino_acids=True,
                        include_bos=True,
                        include_eos=True,
                        include_pad=True,
                        include_mask=True,
                        length=1024):
  """Builds a variable-length discrete domain over a protein vocabulary.

  Every `include_*` flag is forwarded unchanged, by keyword, to
  `domains.ProteinVocab`; the resulting vocabulary and `length` are then
  handed to `domains.VariableLengthDiscreteDomain`.

  Args:
    include_anomalous_amino_acids: Forwarded to `domains.ProteinVocab`.
    include_bos: Forwarded to `domains.ProteinVocab`.
    include_eos: Forwarded to `domains.ProteinVocab`.
    include_pad: Forwarded to `domains.ProteinVocab`.
    include_mask: Forwarded to `domains.ProteinVocab`.
    length: Forwarded as `length` to `domains.VariableLengthDiscreteDomain`
      (presumably the maximum sequence length -- confirm against `domains`).

  Returns:
    The constructed `domains.VariableLengthDiscreteDomain`.
  """
  protein_vocab = domains.ProteinVocab(
      include_anomalous_amino_acids=include_anomalous_amino_acids,
      include_bos=include_bos,
      include_eos=include_eos,
      include_pad=include_pad,
      include_mask=include_mask,
  )
  return domains.VariableLengthDiscreteDomain(
      vocab=protein_vocab,
      length=length,
  )
def setUp(self):
  """Creates the model, domain and input fixture shared by the tests.

  Sets `self._domain` (a length-3 protein domain with every optional vocab
  flag enabled), `self.lm` (a `models.FlaxBERT` built from `lm_cfg` on that
  domain with `grad_clip=1.0`) and `self.xs` (a single-row integer batch),
  then defers to the parent class's `setUp`.
  """
  # Bind the shared config first; `domain` and `grad_clip` are supplied at
  # call time below.
  model_factory = functools.partial(models.FlaxBERT, **lm_cfg)

  full_vocab = domains.ProteinVocab(
      include_anomalous_amino_acids=True,
      include_bos=True,
      include_eos=True,
      include_pad=True,
      include_mask=True,
  )
  self._domain = domains.VariableLengthDiscreteDomain(
      vocab=full_vocab,
      length=3,
  )

  self.lm = model_factory(domain=self._domain, grad_clip=1.0)

  # One sequence whose width matches the domain length of 3.
  self.xs = np.array([[1, 1, 0]])

  super().setUp()
"""Dataset preprocessing and pipeline. Built for Trembl dataset. """ import os import types from absl import logging import gin import numpy as np import tensorflow.compat.v1 as tf from protein_lm import domains protein_domain = domains.VariableLengthDiscreteDomain( vocab=domains.ProteinVocab(include_anomalous_amino_acids=True, include_bos=True, include_eos=True), length=512) def dataset_from_tensors(tensors): """Converts nested tf.Tensors or np.ndarrays to a tf.Data.Dataset.""" if isinstance(tensors, types.GeneratorType) or isinstance(tensors, list): tensors = tuple(tensors) return tf.data.Dataset.from_tensor_slices(tensors) def _parse_example(value): parsed = tf.parse_single_example( value, features={'sequence': tf.io.VarLenFeature(tf.int64)}) sequence = tf.sparse.to_dense(parsed['sequence'])