# Register the `frac` command-line option; its default value is 1.0.
tf.sg_arg_def(
    frac=(
        1.0,
        "test fraction ratio to whole data set. The default is 1.0(=whole set)",
    )
)

# ---------------------------------------------------------------------------
# Hyper-parameters
# ---------------------------------------------------------------------------

batch_size = 16  # batch size handed to the corpus loader

# ---------------------------------------------------------------------------
# Inputs
# ---------------------------------------------------------------------------

# Corpus input tensors (QueueRunner-backed) for the data set selected by the
# `set` command-line option.
data = SpeechCorpus(
    batch_size=batch_size,
    set_name=tf.sg_arg().set,
)

# MFCC audio features and the target sentence labels.
x, y = data.mfcc, data.label

# Sequence length without the zero padding: sum the features over axis 2,
# mark the positions whose sum is non-zero, and count them along axis 1.
_frame_sum = x.sg_sum(axis=2)
_is_frame = tf.not_equal(_frame_sum, 0.)
seq_len = _is_frame.sg_int().sg_sum(axis=1)

# ---------------------------------------------------------------------------
# Testing graph
# ---------------------------------------------------------------------------

# Encode the audio features into logits over `voca_size` classes.
logit = get_logit(x, voca_size=voca_size)
示例#2
0
    "test fraction ratio to whole data set. The default is 1.0(=whole set)"))

#
# hyper parameters
#

# batch size handed to the corpus loader
batch_size = 16

#
# inputs
#

# corpus input tensor ( with QueueRunner )
# Progress message. The call form with a single parenthesised string prints
# the same text on Python 2 and Python 3, whereas the bare `print '...'`
# statement is a SyntaxError on Python 3.
print('building corpus')
data = SpeechCorpus(batch_size=batch_size, set_name=tf.sg_arg().set)

# mfcc feature of audio
x = data.mfcc
# target sentence label
y = data.label

# `filenames` attribute exposed by the corpus object
# NOTE(review): presumably the source file name of each batch item - confirm
# against SpeechCorpus.
filenames_t = data.filenames

# sequence length except zero-padding: sum the features over axis 2, flag the
# positions whose sum is non-zero, and count those flags along axis 1
seq_len = tf.not_equal(x.sg_sum(axis=2), 0.).sg_int().sg_sum(axis=1)

#
# Testing Graph
#
示例#3
0
                                           seq_len,
                                           merge_repeated=False)

# to dense tensor
# NOTE(review): the `+ 1` shifts every decoded id up by one - presumably to
# undo an index offset applied to the labels; confirm against the data module.
y = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape,
                       decoded[0].values) + 1

#
# recognize wave file
#

# command line argument for input wave file path
tf.sg_arg_def(file=('', 'speech wave file to recognize.'))

# load wave file as mono audio at 16 kHz
wav, _ = librosa.load(tf.sg_arg().file, mono=True, sr=16000)
# get mfcc feature
# The audio and sample rate are passed by keyword: librosa >= 0.10 rejects
# them as positional arguments, while older releases accept the keywords too.
# expand_dims adds a leading axis of size 1 and the transpose then swaps the
# last two axes of the result.
mfcc = np.transpose(
    np.expand_dims(librosa.feature.mfcc(y=wav, sr=16000), axis=0),
    [0, 2, 1])

# Evaluate the graph on the extracted features inside a session.
with tf.Session() as sess:
    # Initialise the variables first ...
    tf.sg_init(sess)

    # ... then load the parameters from the latest checkpoint in asset/train.
    saver = tf.train.Saver()
    _ckpt_path = tf.train.latest_checkpoint('asset/train')
    saver.restore(sess, _ckpt_path)

    # Feed the MFCC array into `x` and fetch `y`.
    _feeds = {x: mfcc}
    label = sess.run(y, feed_dict=_feeds)
示例#4
0
#decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]), seq_len, merge_repeated=False)

# to dense tensor
#y = tf.sparse_to_dense(decoded[0].indices, decoded[0].shape, decoded[0].values) + 1

# softmax over the logits (used here in place of the commented-out decoder)
y = tf.nn.softmax(logit)

#
# recognize wave file
#

# command line argument for input wave file path
tf.sg_arg_def(file=('', 'speech wave file to recognize.'))

# load wave file as mono audio
# NOTE(review): no `sr` is given, so librosa resamples to its own default
# rate - confirm that this matches the rate the model was trained on.
wav, sr = librosa.load(tf.sg_arg().file, mono=True)
# get mfcc feature (20 coefficients)
# The audio and sample rate are passed by keyword: librosa >= 0.10 rejects
# them as positional arguments, while older releases accept the keywords too.
# expand_dims adds a leading axis of size 1 and the transpose then swaps the
# last two axes of the result.
mfcc = np.transpose(
    np.expand_dims(librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=20), axis=0),
    [0, 2, 1])
# keep at most the first 100 entries along axis 1
mfcc = mfcc[:, :100]

# Open a session and load the trained parameters into it.
with tf.Session() as sess:
    # Initialise the variables first ...
    tf.sg_init(sess)

    # ... then load the parameters from the latest checkpoint found in
    # asset/train/ckpt.
    saver = tf.train.Saver()
    _ckpt_path = tf.train.latest_checkpoint('asset/train/ckpt')
    saver.restore(sess, _ckpt_path)
示例#5
0
import data


# set the sugartensor verbosity level to 10
tf.sg_verbosity(10)

batch_size = 1     # batch size
vocab_size = data.vocab_size

# input placeholder: `batch_size` items, variable length on axis 1, 20 values
# on the last axis
x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))
# sequence length except zero-padding: sum the features over axis 2, flag the
# positions whose sum is non-zero, and count those flags along axis 1
seq_len = tf.not_equal(x.sg_sum(axis=2), 0.).sg_int().sg_sum(axis=1)

# encode the audio features into logits over `vocab_size` classes
logit = get_logit(x, voca_size=vocab_size)

# CTC beam-search decoding; the first two axes of the logits are swapped
# before they are handed to the decoder
decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]), seq_len, merge_repeated=False)

# to dense tensor
# NOTE(review): the `+ 1` shifts every decoded id up by one - presumably to
# undo an index offset applied to the labels; confirm against the data module.
y = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1

# command line argument for input wave file path
tf.sg_arg_def(file=('', 'speech wave file to recognize.'))
# load wave file as mono audio at 16 kHz
wav, _ = librosa.load(tf.sg_arg().file, mono=True, sr=16000)
# get mfcc feature
# The audio and sample rate are passed by keyword: librosa >= 0.10 rejects
# them as positional arguments, while older releases accept the keywords too.
# expand_dims adds a leading axis of size 1 and the transpose then swaps the
# last two axes of the result.
mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(y=wav, sr=16000), axis=0), [0, 2, 1])

# Evaluate the graph on the extracted features and print the result.
with tf.Session() as sess:
    # Initialise the variables first ...
    tf.sg_init(sess)

    # ... then load the parameters from the latest checkpoint in asset/train.
    saver = tf.train.Saver()
    _ckpt_path = tf.train.latest_checkpoint('asset/train')
    saver.restore(sess, _ckpt_path)

    # Feed the MFCC array into `x` and fetch `y`.
    _feeds = {x: mfcc}
    label = sess.run(y, feed_dict=_feeds)

    # Hand the fetched labels to the data module's printer.
    data.print_index(label)
示例#6
0
# Register the `frac` command-line option; its default value is 1.0.
tf.sg_arg_def(
    frac=(
        1.0,
        "test fraction ratio to whole data set. The default is 1.0(=whole set)",
    )
)

# ---------------------------------------------------------------------------
# Hyper-parameters
# ---------------------------------------------------------------------------

batch_size = 16  # batch size handed to the corpus loader

# ---------------------------------------------------------------------------
# Inputs
# ---------------------------------------------------------------------------

# Corpus input tensors (QueueRunner-backed) for the data set selected by the
# `set` command-line option.
data = SpeechCorpus(
    batch_size=batch_size,
    set_name=tf.sg_arg().set,
)

# MFCC audio features and the target sentence labels.
x, y = data.mfcc, data.label

# Sequence length without the zero padding: sum the features over axis 2,
# mark the positions whose sum is non-zero, and count them along axis 1.
_frame_sum = x.sg_sum(axis=2)
_is_frame = tf.not_equal(_frame_sum, 0.)
seq_len = _is_frame.sg_int().sg_sum(axis=1)

# ---------------------------------------------------------------------------
# Testing graph
# ---------------------------------------------------------------------------

# Encode the audio features into logits over `voca_size` classes.
logit = get_logit(x, voca_size=voca_size)