Exemplo n.º 1
0
  def __init__(
      self, sess, dataset, dataset_size, logdir, ckpt, decoder_params,
      model_params):
    """Builds a single-utterance decoding setup with a median-attention model.

    Args:
      sess: TensorFlow session used to construct/restore the model.
      dataset: Path to the TFRecord file of utterances.
      dataset_size: Number of utterances in the dataset.
      logdir: Directory for logs.
      ckpt: Checkpoint to restore model weights from.
      decoder_params: Decoder configuration (token model path, beam width).
      model_params: Model configuration; mutated in place below.
    """
    self.decoder_params = decoder_params

    # Use the configured token model if one was given; otherwise fall back
    # to the simple character token model.
    if self.decoder_params.token_model:
      self.token_model = token_model.TokenModel(
          self.decoder_params.token_model)
    else:
      self.token_model = token_model.TokenModel(
          "speech4/conf/token_model_character_simple.pbtxt")

    # Force median attention and placeholder-fed input, decoding one token
    # step at a time (tokens_len_max == 1).
    self.model_params = model_params
    self.model_params.attention_params.type = "median"
    self.model_params.attention_params.median_window_l = 10
    self.model_params.attention_params.median_window_r = 100
    self.model_params.tokens_len_max = 1
    self.model_params.input_layer = "placeholder"

    self.dataset = dataset
    self.dataset_size = dataset_size

    self.logdir = logdir

    with tf.variable_scope("model"):
      self.model = las_model.LASModel(
          sess, dataset, logdir, ckpt, True, self.decoder_params.beam_width,
          self.model_params)

    # Graph to read 1 utterance at a time from the TFRecord file.
    # (A second, anonymous string_input_producer was previously created here
    # and never consumed — a dead, unfed queue; it has been removed.)
    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer([self.dataset])
    _, serialized = reader.read(filename_queue)
    serialized = tf.train.batch(
        [serialized], batch_size=1, num_threads=2, capacity=2)

    self.features, _, self.features_len, _, _, self.text, _, _, _, _, self.uttid = s4_parse_utterance(
        serialized, features_len_max=self.model_params.features_len_max,
        tokens_len_max=1)
Exemplo n.º 2
0
  def __init__(
      self, sess, dataset, dataset_size, logdir, ckpt, decoder_params,
      model_params):
    """Builds a single-utterance decoding setup using the decoder input layer.

    Args:
      sess: TensorFlow session used to construct/restore the model.
      dataset: Path to the TFRecord file of utterances.
      dataset_size: Number of utterances in the dataset.
      logdir: Directory for logs.
      ckpt: Checkpoint to restore model weights from.
      decoder_params: Decoder configuration (token model path, beam width).
      model_params: Model configuration; mutated in place below.
    """
    self.decoder_params = decoder_params

    # Use the configured token model if one was given; otherwise fall back
    # to the simple character token model.
    if self.decoder_params.token_model:
      self.token_model = token_model.TokenModel(
          self.decoder_params.token_model)
    else:
      self.token_model = token_model.TokenModel(
          "speech4/conf/token_model_character_simple.pbtxt")

    # Force median attention; unlike the placeholder variant, this one feeds
    # the model through the "decoder" input layer.
    self.model_params = model_params
    self.model_params.attention_params.type = "median"
    self.model_params.attention_params.median_window_l = 10
    self.model_params.attention_params.median_window_r = 100
    self.model_params.input_layer = "decoder"

    self.dataset = dataset
    self.dataset_size = dataset_size

    self.logdir = logdir

    with tf.variable_scope("model"):
      self.model = las_model.LASModel(
          sess, dataset, logdir, ckpt, True, self.decoder_params.beam_width,
          self.model_params)

    # Graph to read 1 utterance at a time from the TFRecord file.
    # (A second, anonymous string_input_producer was previously created here
    # and never consumed — a dead, unfed queue; it has been removed.)
    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer([self.dataset])
    _, serialized = reader.read(filename_queue)
    serialized = tf.train.batch(
        [serialized], batch_size=1, num_threads=2, capacity=2)

    self.features, self.features_fbank, self.features_len, _, self.features_weight, self.text, self.tokens, self.tokens_pinyin, self.tokens_len, self.tokens_weights, self.tokens_pinyin_weights, self.uttid = s4_parse_utterance(
        serialized, features_len_max=self.model_params.features_len_max,
        tokens_len_max=self.model_params.tokens_len_max + 1,
        frame_stack=self.model_params.frame_stack,
        frame_skip=self.model_params.frame_skip)

    # Add the static shape to the features (batch of 1, stacked frames).
    for feature in self.features:
      feature.set_shape([1, self.model_params.features_width * self.model_params.frame_stack])
    # Bug fix: `if token:` evaluated a tf.Tensor as a Python bool, which
    # raises TypeError in graph mode; the intent is to skip missing entries.
    for token in self.tokens:
      if token is not None:
        token.set_shape([1])
Exemplo n.º 3
0
  def create_graph_inputs(self):
    """Builds the input pipeline: shuffled batches parsed from TFRecords.

    Resolves a symbolic dataset name (e.g. 'train_si284') into a TFRecord
    path and size, then wires reader -> shuffle_batch -> s4_parse_utterance
    and attaches static shapes to the parsed features and tokens.
    """
    # Symbolic name -> (tfrecords path, utterance count). Unknown names are
    # assumed to already be file paths and pass through unchanged, matching
    # the original if/elif chain's fall-through behavior.
    dataset_map = {
        'train_si284': ('speech4/data/train_si284.tfrecords', 37416),
        'test_dev93': ('speech4/data/test_dev93.tfrecords', 503),
        'test_eval92': ('speech4/data/test_eval92.tfrecords', 333),
        'ptb_train': ('speech4/data/ptb_train.tfrecords', 42068),
        'ptb_valid': ('speech4/data/ptb_valid.tfrecords', 3370),
        'ptb_test': ('speech4/data/ptb_test.tfrecords', 3761),
    }
    if self.dataset in dataset_map:
      self.dataset, self.dataset_size = dataset_map[self.dataset]
    filename_queue = tf.train.string_input_producer([self.dataset])

    reader = tf.TFRecordReader()
    _, serialized = reader.read(filename_queue)

    # Fixed seed keeps the shuffling reproducible across runs.
    serialized = tf.train.shuffle_batch(
        [serialized], batch_size=self.batch_size, num_threads=2, capacity=self.batch_size * 4 + 512,
        min_after_dequeue=512, seed=1000)

    # Parse the batched serialized strings into the relevant utterance features.
    self.features, self.features_len, _, self.text, self.tokens, self.tokens_len, self.tokens_weights, self.uttid = s4_parse_utterance(
        serialized, features_len_max=self.features_len_max,
        tokens_len_max=self.tokens_len_max + 1)

    # Add the static shape to the features and tokens.
    for feature in self.features:
      feature.set_shape([self.batch_size, self.features_width])
    for token in self.tokens:
      token.set_shape([self.batch_size])