def feature_encoders(self, data_dir):
  """Text inputs; subword targets from an on-disk vocab."""
  # The subword vocab file is expected to already exist in `data_dir`.
  vocab_path = os.path.join(data_dir, "charset_size134.txt")
  encoders = {"inputs": text_encoder.TextEncoder()}
  encoders["targets"] = text_encoder.SubwordTextEncoder(vocab_path)
  return encoders
def feature_encoders(self, data_dir):
  """DNA-chunk inputs, plain-text targets; no on-disk vocab needed."""
  del data_dir  # Unused: encoders are constructed directly, no files read.
  input_encoder = dna_encoder.DNAEncoder(chunk_size=self.chunk_size)
  # TODO(rsepassi): RealEncoder?
  return {"inputs": input_encoder, "targets": text_encoder.TextEncoder()}
def feature_encoders(self, data_dir):
  """Subword inputs (vocab file named by self.vocab_file), text targets."""
  vocab_path = os.path.join(data_dir, self.vocab_file)
  subword = text_encoder.SubwordTextEncoder(vocab_path)
  return {
      "inputs": subword,
      "targets": text_encoder.TextEncoder(),
  }
def audio_wsj_tokens(model_hparams, wrong_vocab_size):
  """English audio transcription benchmark.

  Args:
    model_hparams: a tf.contrib.training.HParams
    wrong_vocab_size: a number used in the filename indicating the approximate
      vocabulary size.  This is not to be confused with the actual vocabulary
      size.

  Returns:
    a tf.contrib.training.HParams
  """
  p = default_problem_hparams()
  # The subword vocab file must already be present in the data directory.
  vocab_path = os.path.join(model_hparams.data_dir,
                            "vocab.endefr.%d" % wrong_vocab_size)
  target_encoder = text_encoder.SubwordTextEncoder(vocab_path)
  p.input_modality = {"inputs": (registry.Modalities.AUDIO, None)}
  p.target_modality = (registry.Modalities.SYMBOL, target_encoder.vocab_size)
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": target_encoder,
  }
  p.batch_size_multiplier = 512
  p.loss_multiplier = 2.0
  p.input_space_id = 12  # audio waveform domain
  p.target_space_id = 3  # English tokens
  return p
def feature_encoders(self, data_dir):
  """Text inputs; subword targets, vocab filename keyed by target size."""
  # The filename carries the approximate target vocab size.
  path = os.path.join(data_dir, "vocab.endefr.%d" % self.target_vocab_size)
  return {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.SubwordTextEncoder(path),
  }
def image_mscoco_characters(unused_model_hparams):
  """COCO image captioning with captions as characters."""
  p = default_problem_hparams()
  p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)}
  p.target_modality = (registry.Modalities.SYMBOL, 256)  # one symbol per byte
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  # Batching / reader knobs and loss scaling.
  for attr, value in (("batch_size_multiplier", 128),
                      ("max_expected_batch_size_per_shard", 2),
                      ("loss_multiplier", 2.0)):
    setattr(p, attr, value)
  p.input_space_id = 1  # image labels
  p.target_space_id = 2  # English characters
  return p
def image_mscoco_tokens(model_hparams, vocab_count):
  """COCO image captioning with captions as tokens.

  Args:
    model_hparams: hparams object; `data_dir` is read to locate the vocab.
    vocab_count: number used in the vocab filename (approximate vocab size).

  Returns:
    a problem-hparams object.
  """
  p = default_problem_hparams()
  p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)}
  # This vocab file must be present within the data directory.
  vocab_filename = os.path.join(model_hparams.data_dir,
                                "vocab.endefr.%d" % vocab_count)
  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
  p.target_modality = (registry.Modalities.SYMBOL, subtokenizer.vocab_size)
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": subtokenizer,
  }
  p.batch_size_multiplier = 256
  p.max_expected_batch_size_per_shard = 2
  # BUG FIX: the original fell off the end and returned None, and never set
  # the space ids.  Space ids mirror the sibling modality-based
  # image_mscoco_tokens: 1 = image labels, 3 = English tokens.
  p.input_space_id = 1
  p.target_space_id = 3
  return p
def image_mscoco_characters(model_hparams):
  """COCO image captioning with captions as characters."""
  p = default_problem_hparams()
  # Image in, one symbol per byte out.
  p.input_modality = {"inputs": modality.ImageModality(model_hparams)}
  p.target_modality = modality.SymbolModality(model_hparams, 256)
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  p.batch_size_multiplier = 128
  p.max_expected_batch_size_per_shard = 2
  p.loss_multiplier = 2.0
  p.input_space_id = 1  # image labels
  p.target_space_id = 2  # English characters
  return p
def audio_wsj_characters(model_hparams):
  """English audio transcription benchmark (byte-level targets)."""
  p = default_problem_hparams()
  # Spectral audio in, one symbol per byte out.
  p.input_modality = {"inputs": modality.AudioSpectralModality(model_hparams)}
  p.target_modality = modality.SymbolModality(model_hparams, 256)
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  p.batch_size_multiplier = 512
  p.loss_multiplier = 2.0
  p.input_space_id = 13  # audio spectral domain
  p.target_space_id = 2  # English characters
  return p
def audio_timit_characters(unused_model_hparams):
  """English audio transcription benchmark."""
  p = default_problem_hparams()
  p.input_modality = {"inputs": (registry.Modalities.AUDIO, None)}
  p.target_modality = (registry.Modalities.SYMBOL, 256)  # byte-level targets
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  for attr, value in (("batch_size_multiplier", 256),
                      ("loss_multiplier", 2.0)):
    setattr(p, attr, value)
  p.input_space_id = 12  # audio waveform domain
  p.target_space_id = 2  # English characters
  return p
def image_mscoco_tokens(model_hparams, vocab_count):
  """COCO image captioning with captions as tokens."""
  p = default_problem_hparams()
  # The subword vocab file must already be in the data directory; its name
  # carries the approximate vocab size.
  vocab_path = os.path.join(model_hparams.data_dir,
                            "tokens.vocab.%d" % vocab_count)
  caption_encoder = text_encoder.SubwordTextEncoder(vocab_path)
  p.input_modality = {"inputs": modality.ImageModality(model_hparams)}
  p.target_modality = modality.SymbolModality(model_hparams,
                                              caption_encoder.vocab_size)
  p.vocabulary = {"inputs": text_encoder.TextEncoder(),
                  "targets": caption_encoder}
  p.batch_size_multiplier = 256
  p.max_expected_batch_size_per_shard = 2
  p.input_space_id = 1  # image labels
  p.target_space_id = 3  # English tokens
  return p
def hparams(self, defaults, model_hparams):
  """Fill in `defaults` with this problem's hparams (mutates in place).

  Args:
    defaults: problem-hparams object to update.
    model_hparams: model hyperparameters; `data_dir` is read from here.
  """
  p = defaults
  # This vocab file must be present within the data directory.
  vocab_filename = os.path.join(model_hparams.data_dir, "charset_size134.txt")
  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
  p.vocabulary = {
      "inputs": text_encoder.TextEncoder(),
      "targets": subtokenizer,
  }
  p.batch_size_multiplier = 256
  p.max_expected_batch_size_per_shard = 2
  # BUG FIX: the original assigned p.input_modality (IMAGE) and
  # p.target_modality (SYMBOL, subtokenizer.vocab_size) and then immediately
  # overwrote both with the fixed-size SYMBOL modalities below; those dead
  # first assignments have been removed.
  # NOTE(review): the surviving SYMBOL/144 modalities plus DIGIT space ids
  # look like they belong to a digit-cipher problem, while the vocabulary
  # above is charset-based -- confirm which configuration is intended.
  vocab_size = 144
  p.input_modality = {"inputs": (registry.Modalities.SYMBOL, vocab_size)}
  p.target_modality = (registry.Modalities.SYMBOL, vocab_size)
  p.input_space_id = problem.SpaceID.DIGIT_0
  p.target_space_id = problem.SpaceID.DIGIT_1
def default_problem_hparams():
  """A set of basic model hyperparameters."""
  return tf.contrib.training.HParams(
      # Ratio used to make perplexities comparable across tokenizations.
      # Set it to (number of test-set tokens under the tokenization in use)
      # / (number of test-set tokens under the "official" tokenization).
      # For example, a word-piece model reporting per-word perplexity should
      # set this to the average number of wordpieces per word on the test
      # set.
      loss_multiplier=1.0,
      # Allows larger sequences in a batch.  Without it, only the size of
      # the inner two dimensions is used to judge the sequence length.
      batch_size_multiplier=1,
      # Maximal expected batch size, used to give queues the right capacity.
      # Affects only input-reader performance and memory use; the default is
      # safe and fast.  Decrease it if the reader uses too much memory,
      # increase it if reading is slow.
      max_expected_batch_size_per_shard=64,
      # Modalities mapping input features into a space compatible with the
      # chosen model architecture.  One modality spec per feature key; a
      # spec is a 2-tuple (modality_full_name, vocab_size) where
      # modality_full_name is a "type:name" string, e.g. "class_label:2d".
      # Leaving off the name selects the default modality for that type
      # (class_label == class_label:default).
      input_modality={},
      # Modality mapping the hidden representation to the target space,
      # specified as a modality spec (a 2-tuple as described above).
      target_modality=None,
      # Identifiers telling the model which input/target space to expect --
      # e.g. French characters out, or Spanish audio in.  An integer with
      # the following semantics:
      #    0: Generic / unknown output space (default)
      #    1: Image labels
      #    2: English characters
      #    3: English tokens
      #    4: English bpe tokens
      #    5: French characters
      #    6: French tokens
      #    7: German characters
      #    8: German tokens
      #    9: German bpe tokens
      #   10: Digit cipher lexicon 0
      #   11: Digit cipher lexicon 1
      #   12: Audio waveform domain
      #   13: Audio spectral domain
      #   14: Parse characters
      #   15: Parse tokens
      #   16: Chinese tokens
      #   17: Icelandic characters
      #   18: Icelandic tokens
      #   19: Icelandic parse tokens
      #   20: Macedonian tokens
      #   21: Czech tokens
      #   22: Czech characters
      # Add more above if needed.
      input_space_id=0,
      target_space_id=0,
      # Vocabulary per feature key; a vocabulary converts to/from
      # human-readable strings, e.g.
      #   {"inputs": text_encoder.ByteTextEncoder(),
      #    "targets": text_encoder.SubwordTextEncoder("vocab_filename.txt")}
      vocabulary={
          "inputs": text_encoder.TextEncoder(),
          "targets": text_encoder.TextEncoder()
      },
      # Markers tracking whether the problem was reversed or copied.
      # Set automatically only; do not override the defaults.  The tags can
      # be combined to copy inputs or targets: `problem_copy` copies the
      # inputs, while `problem_rev_copy` copies the targets.
      was_reversed=False,
      was_copy=False,
  )
def feature_encoders(self, _):
  """Plain-text inputs; targets decode as raw bytes."""
  encoders = {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.ByteTextEncoder(),
  }
  return encoders
def feature_encoders(self, data_dir):
  """Text inputs, class-label targets; no vocab files are read."""
  del data_dir  # Unused: encoders need no on-disk assets.
  label_encoder = text_encoder.ClassLabelEncoder(self.class_labels)
  return {"inputs": text_encoder.TextEncoder(), "targets": label_encoder}
def feature_encoders(self, _):
  """Plain-text inputs; targets use the Librispeech-specific encoder."""
  input_encoder = text_encoder.TextEncoder()
  return {"inputs": input_encoder, "targets": LibrispeechTextEncoder()}
def feature_encoders(self, data_dir):
  """Both features decode as plain text; data_dir is ignored."""
  del data_dir
  make_text = text_encoder.TextEncoder
  return {"inputs": make_text(), "targets": make_text()}
def feature_encoders(self, data_dir):
  """Plain-text inputs; class-label targets from self.class_labels()."""
  # data_dir is accepted for interface compatibility but not used here.
  input_encoder = text_encoder.TextEncoder()
  target_encoder = text_encoder.ClassLabelEncoder(self.class_labels())
  return {"inputs": input_encoder, "targets": target_encoder}