# Test-set fraction command-line argument.
tf.sg_arg_def(frac=(1.0, "test fraction ratio to whole data set. The default is 1.0(=whole set)"))

#
# hyper parameters
#

batch_size = 16  # batch size

#
# inputs
#

# corpus input tensor ( with QueueRunner )
data = SpeechCorpus(batch_size=batch_size, set_name=tf.sg_arg().set)

# mfcc feature of audio
x = data.mfcc
# target sentence label
y = data.label

# sequence length except zero-padding
# NOTE(review): assumes a frame whose MFCC coefficients sum to exactly 0.0
# is padding — confirm the corpus zero-pads frames.
seq_len = tf.not_equal(x.sg_sum(axis=2), 0.).sg_int().sg_sum(axis=1)

#
# Testing Graph
#

# encode audio feature
logit = get_logit(x, voca_size=voca_size)
"test fraction ratio to whole data set. The default is 1.0(=whole set)")) # # hyper parameters # # batch size batch_size = 16 # # inputs # # corpus input tensor ( with QueueRunner ) print 'building corpus' data = SpeechCorpus(batch_size=batch_size, set_name=tf.sg_arg().set) # mfcc feature of audio x = data.mfcc # target sentence label y = data.label filenames_t = data.filenames # sequence length except zero-padding seq_len = tf.not_equal(x.sg_sum(axis=2), 0.).sg_int().sg_sum(axis=1) # # Testing Graph #
# NOTE(review): this chunk began mid-call; the decoder head below is
# reconstructed from the identical statement in the standalone recognizer
# elsewhere in this file — confirm against the original line breaks.
decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]),
                                           seq_len, merge_repeated=False)

# to dense tensor
# NOTE(review): the '+1' presumably shifts decoded indices past a reserved
# padding index 0 — confirm against the vocabulary layout.
y = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape,
                       decoded[0].values) + 1

#
# recognize wave file
#

# command line argument for input wave file path
tf.sg_arg_def(file=('', 'speech wave file to recognize.'))

# load wave file, resampled to 16 kHz mono to match the training corpus
wav, _ = librosa.load(tf.sg_arg().file, mono=True, sr=16000)

# get mfcc feature, shaped (1, time, n_mfcc) for the batch-major graph input
mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, 16000), axis=0),
                    [0, 2, 1])

# run network
with tf.Session() as sess:
    # init variables
    tf.sg_init(sess)

    # restore parameters
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('asset/train'))

    # run session
    label = sess.run(y, feed_dict={x: mfcc})
# NOTE(review): the beam-search decoder path is disabled in this variant;
# the graph emits raw per-frame softmax posteriors instead of a decoded
# label sequence.
# decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]),
#                                            seq_len, merge_repeated=False)
# to dense tensor
# y = tf.sparse_to_dense(decoded[0].indices, decoded[0].shape, decoded[0].values) + 1
y = tf.nn.softmax(logit)

#
# recognize wave file
#

# command line argument for input wave file path
tf.sg_arg_def(file=('', 'speech wave file to recognize.'))

# load wave file (librosa's default sample rate; sr is reused for mfcc below)
wav, sr = librosa.load(tf.sg_arg().file, mono=True)

# get mfcc feature, shaped (1, time, 20) for the batch-major graph input
mfcc = np.transpose(
    np.expand_dims(librosa.feature.mfcc(wav, sr, n_mfcc=20), axis=0),
    [0, 2, 1])
# keep at most the first 100 frames
# NOTE(review): looks like a debugging/latency limit — confirm before relying
# on it for long utterances.
mfcc = mfcc[:, :100]

# run network
with tf.Session() as sess:
    # init variables
    tf.sg_init(sess)

    # restore parameters
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('asset/train/ckpt'))
    # NOTE(review): chunk ends here; the sess.run(...) consumer of `y` lies
    # outside this view.
import data

# set log level to debug
tf.sg_verbosity(10)

batch_size = 1                  # batch size (single utterance)
vocab_size = data.vocab_size    # vocabulary size

# mfcc feature placeholder: (batch, time, 20) — 20 MFCC coefficients per frame
x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))

# sequence length except zero-padding
seq_len = tf.not_equal(x.sg_sum(axis=2), 0.).sg_int().sg_sum(axis=1)

# encode audio feature
logit = get_logit(x, voca_size=vocab_size)

# ctc beam-search decoding (decoder wants time-major input, hence the transpose)
decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]),
                                           seq_len, merge_repeated=False)

# to dense tensor
# NOTE(review): the '+1' presumably shifts decoded indices past a reserved
# padding index 0 — confirm against the vocabulary layout.
y = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape,
                       decoded[0].values) + 1

# command line argument for input wave file path
tf.sg_arg_def(file=('', 'speech wave file to recognize.'))

# load wave file, resampled to 16 kHz mono to match the training corpus
wav, _ = librosa.load(tf.sg_arg().file, mono=True, sr=16000)

# get mfcc feature, shaped (1, time, n_mfcc) for the batch-major placeholder
mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, 16000), axis=0),
                    [0, 2, 1])

# run network
with tf.Session() as sess:
    # init variables
    tf.sg_init(sess)

    # restore parameters
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('asset/train'))

    # run session and print the decoded index sequence as text
    label = sess.run(y, feed_dict={x: mfcc})
    data.print_index(label)
# Test-set fraction command-line argument.
tf.sg_arg_def(frac=(1.0, "test fraction ratio to whole data set. The default is 1.0(=whole set)"))

#
# hyper parameters
#

batch_size = 16  # batch size

#
# inputs
#

# corpus input tensor ( with QueueRunner )
data = SpeechCorpus(batch_size=batch_size, set_name=tf.sg_arg().set)

# mfcc feature of audio
x = data.mfcc
# target sentence label
y = data.label

# sequence length except zero-padding
# NOTE(review): assumes a frame whose MFCC coefficients sum to exactly 0.0
# is padding — confirm the corpus zero-pads frames.
seq_len = tf.not_equal(x.sg_sum(axis=2), 0.).sg_int().sg_sum(axis=1)

#
# Testing Graph
#

# encode audio feature
logit = get_logit(x, voca_size=voca_size)