예제 #1
0
    def decode(self, predictions, seq_len, k):
        """Run the configured CTC decoder and return only the decoded paths.

        Args:
            predictions: passed unchanged as the decoder's first argument.
            seq_len: passed unchanged as the decoder's second argument.
            k: forwarded as ``top_paths`` when ``self.ctc_decoder`` is
                ``'beam_search'``; not used by the ``'greedy'`` branch.

        Returns:
            The first element of the ``(decoded, log_prob)`` pair produced by
            the decoder. The second element is discarded.

        Raises:
            Exception: if ``self.ctc_decoder`` is neither ``'greedy'`` nor
                ``'beam_search'``.
        """
        mode = self.ctc_decoder

        # Reject unknown settings up front so the happy path stays flat.
        if not (mode == 'greedy' or mode == 'beam_search'):
            raise Exception("model type not supported: {}".format(mode))

        if mode == 'greedy':
            outcome = ctc_ops.ctc_greedy_decoder(predictions, seq_len)
        else:
            outcome = ctc_ops.ctc_beam_search_decoder(
                predictions, seq_len, top_paths=k)

        paths, _ = outcome
        return paths
예제 #2
0
 def setup_decoder(self):
     """Create the CTC decoding ops selected by ``self.beam_search_decoder``.

     ``'default'`` uses ``ctc_ops.ctc_beam_search_decoder`` and ``'greedy'``
     uses ``ctc_ops.ctc_greedy_decoder``; both are called on ``self.logits``
     and ``self.seq_length`` with ``merge_repeated=False``. The result is
     stored in ``self.decoded`` and ``self.log_prob``.

     Any other value only logs a warning; in that case neither attribute
     is assigned by this method.
     """
     with tf.name_scope("decode"):
         choice = self.beam_search_decoder
         if choice == 'default':
             decoder_fn = ctc_ops.ctc_beam_search_decoder
         elif choice == 'greedy':
             decoder_fn = ctc_ops.ctc_greedy_decoder
         else:
             logging.warning("Invalid beam search decoder option selected!")
             return

         self.decoded, self.log_prob = decoder_fn(
             self.logits, self.seq_length, merge_repeated=False)
예제 #3
0
    def ctc_decode(self,
                   y_pred,
                   input_length,
                   greedy=True,
                   beam_width=100,
                   top_paths=1,
                   merge_repeated=False):
        """Decode softmax output with CTC, greedily or by beam search.

        # Arguments
            y_pred: tensor `(samples, time_steps, num_categories)`
                holding the prediction (softmax output).
            input_length: tensor `(samples, )` with the sequence length of
                each batch item in `y_pred`; it is cast to int32.
            greedy: when truthy, use the greedy (best path) decoder;
                otherwise use the beam search decoder.
            beam_width: beam width, used only by the beam search decoder.
            top_paths: number of paths requested from the beam search
                decoder; ignored by the greedy decoder.
            merge_repeated: forwarded to the beam search decoder only.

        # Returns
            Tuple:
                List of dense tensors, one per sparse tensor returned by the
                    decoder. Positions that are absent from the sparse result
                    are filled with `-1`.
                The log-probability value returned by the decoder.
        """
        epsilon = 1e-7

        # The decoders are fed the log of the time-major transpose; epsilon
        # keeps the logarithm finite where a probability is exactly zero.
        time_major = tf.transpose(y_pred, perm=[1, 0, 2])
        log_inputs = tf_math_ops.log(time_major + epsilon)
        lengths = tf.cast(input_length, tf.int32)

        if not greedy:
            sparse_paths, log_prob = ctc_ops.ctc_beam_search_decoder(
                inputs=log_inputs,
                sequence_length=lengths,
                beam_width=beam_width,
                top_paths=top_paths,
                merge_repeated=merge_repeated)
        else:
            sparse_paths, log_prob = ctc_ops.ctc_greedy_decoder(
                inputs=log_inputs, sequence_length=lengths)

        dense_paths = [
            tf.sparse.to_dense(path, default_value=-1)
            for path in sparse_paths
        ]
        return dense_paths, log_prob
예제 #4
0
    def build_graph(self, args, maxTimeSteps):
        """Build the TensorFlow graph for the residual-network CTC model.

        Creates a fresh ``tf.Graph`` and, inside it, the input and target
        placeholders, the network output from ``build_resnet``, the CTC
        loss, the training op, a beam-search prediction op, an error-rate
        op, the variable initializer and a ``Saver``. Everything is stored
        as attributes on ``self``; nothing is returned.

        Args:
            args: configuration object. This method reads ``batch_size``,
                ``num_feature``, ``num_class``, ``optimizer`` (called with
                the learning rate) and ``learning_rate`` from it.
            maxTimeSteps: first dimension of the input placeholder.
        """
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Input features, declared as (maxTimeSteps, batch, feature).
            self.inputX = tf.placeholder(
                tf.float32,
                shape=(maxTimeSteps, args.batch_size,
                       args.num_feature))  # e.g. [maxL, 32, 39]
            inputXrs = tf.reshape(self.inputX, [-1, args.num_feature])
            self.inputList = tf.split(
                inputXrs, maxTimeSteps,
                0)  # maxTimeSteps equal slices of inputXrs along axis 0

            # The sparse target is assembled from three separately fed parts.
            self.targetIxs = tf.placeholder(tf.int64)
            self.targetVals = tf.placeholder(tf.int32)
            self.targetShape = tf.placeholder(tf.int64)
            self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals,
                                           self.targetShape)
            self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size))
            # Network size is fixed here rather than read from args.
            depth = 10
            width = 8
            # Summary of the model settings, kept on the instance.
            self.config = {
                'name': 'residual network',
                'num_layer': depth,
                'num_featuremap': width,
                'num_class': args.num_class,
                'optimizer': args.optimizer,
                'learning rate': args.learning_rate
            }

            # NOTE(review): inputX is declared (maxTimeSteps, batch, feature)
            # and is reshaped (not transposed) to (batch, maxTimeSteps,
            # feature, 1) -- confirm this yields the intended layout.
            inpt = tf.reshape(
                self.inputX,
                [args.batch_size, maxTimeSteps, args.num_feature, 1])
            conv_output = build_resnet(inpt, maxTimeSteps, depth, width,
                                       args.num_class)
            # Mean of the CTC loss values.
            self.loss = tf.reduce_mean(
                ctc.ctc_loss(self.targetY, conv_output, self.seqLengths))
            self.optimizer = args.optimizer(args.learning_rate).minimize(
                self.loss)
            # Argmax over axis 2, cut down to the first seqLengths[0] rows
            # and the first column.
            self.logitsMaxTest = tf.slice(tf.argmax(conv_output, 2), [0, 0],
                                          [self.seqLengths[0], 1])
            # First path returned by the beam search decoder, as int32.
            self.predictions = tf.to_int32(
                ctc.ctc_beam_search_decoder(conv_output,
                                            self.seqLengths)[0][0])
            # Sum of unnormalized edit distances divided by the number of
            # target values.
            self.errorRate = tf.reduce_sum(
                tf.edit_distance(self.predictions,
                                 self.targetY,
                                 normalize=False)) / tf.to_float(
                                     tf.size(self.targetY.values))
            self.initial_op = tf.global_variables_initializer()
            self.saver = tf.train.Saver(tf.global_variables(),
                                        max_to_keep=2,
                                        keep_checkpoint_every_n_hours=1)
            # Variable lists kept for later inspection by callers.
            self.var_op = tf.global_variables()
            self.var_trainable_op = tf.trainable_variables()
예제 #5
0
def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
    """Decode softmax output with CTC, greedily or by beam search.

    Both decoders are called with ``merge_repeated=False``.

    # Arguments
        y_pred: tensor `(samples, time_steps, num_categories)`
            holding the prediction (softmax output).
        input_length: tensor `(samples, )` with the sequence length of
            each batch item in `y_pred`; it is converted to int32.
        greedy: when truthy, use the greedy (best path) decoder;
            otherwise use the beam search decoder.
        beam_width: beam width, used only by the beam search decoder.
        top_paths: number of paths requested from the beam search decoder;
            ignored by the greedy decoder.

    # Returns
        Tuple:
            List of dense tensors, one per sparse tensor returned by the
                decoder. Positions that are absent from the sparse result
                are filled with `-1`.
            The log-probability value returned by the decoder.
    """
    # The decoders receive the log of the time-major transpose; the small
    # constant keeps the logarithm finite at zero probability.
    log_inputs = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
    lengths = tf.to_int32(input_length)

    if not greedy:
        sparse_paths, log_prob = ctc.ctc_beam_search_decoder(
            inputs=log_inputs,
            sequence_length=lengths,
            beam_width=beam_width,
            top_paths=top_paths,
            merge_repeated=False)
    else:
        sparse_paths, log_prob = ctc.ctc_greedy_decoder(
            inputs=log_inputs,
            sequence_length=lengths,
            merge_repeated=False)

    dense_paths = []
    for path in sparse_paths:
        dense_paths.append(
            tf.sparse_to_dense(path.indices,
                               path.dense_shape,
                               path.values,
                               default_value=-1))
    return (dense_paths, log_prob)
    def build_graph(self, args, maxTimeSteps):
        """Build the graph for a single-layer bidirectional RNN CTC model.

        Creates a fresh ``tf.Graph`` and, inside it, the input and target
        placeholders, one bidirectional RNN layer, a per-slice linear output
        layer, the CTC loss, an Adam training op, a beam-search prediction
        op, an error-rate op, the variable initializer and a ``Saver``.
        Everything is stored as attributes on ``self``; nothing is returned.

        The calls use the pre-1.0 TensorFlow argument order
        (``tf.concat(axis, values)``, ``tf.split(axis, num, value)``,
        ``tf.pack``, ``ctc_loss(inputs, labels, ...)``); that is left as is.

        Args:
            args: configuration object. This method reads ``batch_size``,
                ``num_feature``, ``model``, ``num_layer``, ``num_hidden``,
                ``num_class``, ``activation``, ``optimizer``,
                ``learning_rate`` and ``log_dir`` from it.
            maxTimeSteps: first dimension of the input placeholder.
        """
        # Indentation is spaces only: the previous tab/space mix is rejected
        # by Python 3 with a TabError.
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.inputX = tf.placeholder(
                tf.float32,
                shape=(maxTimeSteps, args.batch_size,
                       args.num_feature))  # e.g. [maxL, 32, 39]
            # NOTE(review): inputX is declared (maxTimeSteps, batch, feature)
            # and is reshaped (not transposed) to (batch, maxTimeSteps,
            # feature) -- confirm this yields the intended layout.
            self.inputXX = tf.reshape(
                self.inputX,
                shape=(args.batch_size, maxTimeSteps, args.num_feature))
            inputXrs = tf.reshape(self.inputX, [-1, args.num_feature])

            # The sparse target is assembled from three separately fed parts.
            self.targetIxs = tf.placeholder(tf.int64)
            self.targetVals = tf.placeholder(tf.int32)
            self.targetShape = tf.placeholder(tf.int64)
            self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals,
                                           self.targetShape)
            self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size))
            # Summary of the model settings, kept on the instance.
            self.config = {
                'name': args.model,
                'rnncell': self.cell_fn,
                'num_layer': args.num_layer,
                'num_hidden': args.num_hidden,
                'num_class': args.num_class,
                'activation': args.activation,
                'optimizer': args.optimizer,
                'learning rate': args.learning_rate
            }

            # forward layer
            forwardH1 = self.cell_fn(args.num_hidden, activation=tf.nn.relu)
            # backward layer
            backwardH1 = self.cell_fn(args.num_hidden, activation=tf.nn.relu)
            # bi-directional layer
            fbH1, state = bidirectional_dynamic_rnn(
                forwardH1,
                backwardH1,
                self.inputXX,
                sequence_length=self.seqLengths,
                dtype=tf.float32,
                scope='BDRNN_H1')
            # Join the forward and backward outputs along axis 2.
            fbH1 = tf.concat(2, fbH1)
            # Call get_shape(): printing the bare attribute only shows the
            # bound method, never the shape.
            print(fbH1.get_shape())
            shape = fbH1.get_shape().as_list()
            # Flatten the first two axes, then cut into shape[1] slices.
            fbH1 = tf.reshape(fbH1, [shape[0] * shape[1], -1])
            fbH1_list = tf.split(0, shape[1], fbH1)
            logits = [
                build_forward_layer(t, [shape[2], args.num_class],
                                    kernel='linear') for t in fbH1_list
            ]
            logits3d = tf.pack(logits)
            self.loss = tf.reduce_mean(
                ctc.ctc_loss(logits3d, self.targetY, self.seqLengths))
            self.optimizer = tf.train.AdamOptimizer(
                args.learning_rate).minimize(self.loss)
            # Argmax over axis 2, cut down to the first seqLengths[0] rows
            # and the first column.
            self.logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0],
                                          [self.seqLengths[0], 1])
            # First path returned by the beam search decoder, as int32.
            self.predictions = tf.to_int32(
                ctc.ctc_beam_search_decoder(logits3d, self.seqLengths)[0][0])
            # Sum of unnormalized edit distances divided by the number of
            # target values.
            self.errorRate = tf.reduce_sum(
                tf.edit_distance(self.predictions,
                                 self.targetY,
                                 normalize=False)) / tf.to_float(
                                     tf.size(self.targetY.values))
            self.initial_op = tf.initialize_all_variables()
            self.saver = tf.train.Saver(tf.all_variables(),
                                        max_to_keep=5,
                                        keep_checkpoint_every_n_hours=1)
            # Log file name: log_dir plus a timestamp with spaces and
            # slashes removed. The replace() calls apply only to the
            # timestamp part, not to log_dir.
            self.logfile = args.log_dir + str(
                datetime.datetime.strftime(datetime.datetime.now(),
                                           '%Y-%m-%d %H:%M:%S') +
                '.txt').replace(' ', '').replace('/', '')
            self.var_op = tf.all_variables()
            self.var_trainable_op = tf.trainable_variables()
예제 #7
0
def ctc_decode(y_pred, input_length, max_output_length):
    """
    Decode softmax output with CTC beam search and pad to a fixed width.

    Cut down from https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L4170

    Uses the beam search decoder with a beam width of 10 and keeps only the
    first returned path.

    # Arguments
        y_pred: tensor `(samples, time_steps, num_categories)`
            holding the prediction (softmax output).
        input_length: tensor with the sequence length of each batch item in
            `y_pred`; its last axis is squeezed away before use.
        max_output_length: int giving the max output sequence length

    # Returns
        Dense tensor of the first decoded path, with `-1` where the sparse
        result has no entry. If it has fewer than `max_output_length + 2`
        columns it is right-padded with `-1` up to that width; otherwise it
        is returned unchanged.
    """
    log_inputs = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + K.epsilon())
    lengths = tf.to_int32(tf.squeeze(input_length, axis=-1))

    sparse_paths, _ = ctc_ops.ctc_beam_search_decoder(
        inputs=log_inputs, sequence_length=lengths, beam_width=10)

    best = sparse_paths[0]
    dense = tf.sparse_to_dense(best.indices,
                               best.dense_shape,
                               best.values,
                               default_value=-1)

    # The column count of `dense` depends on what was decoded, so bring it
    # to one standard width; two extra columns are allowed as CTC leeway.
    target_width = max_output_length + 2
    width = tf.shape(dense)[-1]

    return tf.cond(
        tf.less(width, target_width),
        lambda: tf.pad(dense, [[0, 0], [0, target_width - width]],
                       constant_values=-1),
        lambda: dense)
    def ctc_complete_analysis_lambda_func(args, **arguments):
        """
        Decode predictions with CTC and score them against the labels.

        Requires the Keras backend to be TensorFlow (checked with an assert).

        :param args:
            sequence of four items: y_pred, labels, input_length, label_len
        :param arguments:
            keyword arguments; 'greedy', 'beam_width' and 'top_paths' must
            all be present
        :return:
            result of tf_edit_distance (called with norm=True) between the
            first decoded path and the sparse labels, passed through
            Kreshape_To1D and cast to float32
        """
        y_pred, labels, input_length, label_len = args

        assert (K.backend() == 'tensorflow')

        # The decoders receive the log of the time-major transpose.
        log_inputs = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
        seq_lengths = tf.to_int32(tf.squeeze(input_length))

        use_greedy = arguments['greedy']
        width = arguments['beam_width']
        n_paths = arguments['top_paths']

        if not use_greedy:
            decoded, _ = ctc.ctc_beam_search_decoder(
                inputs=log_inputs,
                sequence_length=seq_lengths,
                beam_width=width,
                top_paths=n_paths)
        else:
            decoded, _ = ctc.ctc_greedy_decoder(inputs=log_inputs,
                                                sequence_length=seq_lengths)

        best_path = tf.cast(decoded[0], tf.float32)

        label_lengths = tf.cast(tf.squeeze(label_len), tf.int32)
        sparse_labels = K.ctc_label_dense_to_sparse(labels, label_lengths)
        distances = tf_edit_distance(best_path, sparse_labels, norm=True)
        per_sequence = Kreshape_To1D(distances)

        return K.cast(per_sequence, dtype='float32')
예제 #9
0
    def loss(self):
        """
        Build the CTC loss, training, decoding and label-error-rate ops.

        Sets ``self.avg_loss``, ``self.optimizer``, ``self.decoded``,
        ``self.distance`` and ``self.label_err`` and registers two scalar
        summaries. ``learning_rate`` is not read from ``self``; it is
        resolved from the enclosing scope.
        :return: None
        """
        # Mean CTC loss, recorded under the 'loss' summary tag.
        with tf.name_scope('loss'):
            ctc_values = ctc_ops.ctc_loss(self.text, self.logits,
                                          self.seq_length)
            self.avg_loss = tf.reduce_mean(ctc_values)
            tf.summary.scalar('loss', self.avg_loss)

        # Training step.
        with tf.name_scope('train'):
            adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.optimizer = adam.minimize(self.avg_loss)

        with tf.name_scope("decode"):
            beam_result = ctc_ops.ctc_beam_search_decoder(
                self.logits, self.seq_length, merge_repeated=False)
            self.decoded, _ = beam_result

        with tf.name_scope("accuracy"):
            best_path = tf.cast(self.decoded[0], tf.int32)
            self.distance = tf.edit_distance(best_path, self.text)
            # Mean edit distance is the label error rate; note that it is
            # logged under the 'accuracy' summary tag.
            self.label_err = tf.reduce_mean(self.distance,
                                            name='label_error_rate')
            tf.summary.scalar('accuracy', self.label_err)
예제 #10
0
    def loss(self):
        """
        Build the CTC loss, training, decoding and label-error-rate ops.

        Sets ``self.avg_loss``, ``self.optimizer``, ``self.decoded``,
        ``self.distance`` and ``self.label_err`` and registers two scalar
        summaries. ``learning_rate`` is not read from ``self``; it is
        resolved from the enclosing scope.
        :return: None
        """
        # Mean CTC loss, recorded under the 'loss' summary tag.
        with tf.name_scope('loss'):
            ctc_values = ctc_ops.ctc_loss(self.text, self.logits,
                                          self.seq_length)
            self.avg_loss = tf.reduce_mean(ctc_values)
            tf.summary.scalar('loss', self.avg_loss)

        # Training step.
        with tf.name_scope('train'):
            adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.optimizer = adam.minimize(self.avg_loss)

        with tf.name_scope("decode"):
            beam_result = ctc_ops.ctc_beam_search_decoder(
                self.logits, self.seq_length, merge_repeated=False)
            self.decoded, _ = beam_result

        with tf.name_scope("accuracy"):
            best_path = tf.cast(self.decoded[0], tf.int32)
            self.distance = tf.edit_distance(best_path, self.text)
            # Mean edit distance is the label error rate; note that it is
            # logged under the 'accuracy' summary tag.
            self.label_err = tf.reduce_mean(self.distance,
                                            name='label_error_rate')
            tf.summary.scalar('accuracy', self.label_err)
예제 #11
0
    def build_graph(self, args, maxTimeSteps):
        """Build the graph for the multi-layer bidirectional RNN CTC model.

        Creates a fresh ``tf.Graph`` and, inside it, the input and target
        placeholders, the recurrent stack from ``build_multi_dynamic_brnn``,
        a shared linear output layer, the CTC loss, an Adam training op
        (optionally with global-norm gradient clipping), a beam-search
        prediction op, an error-rate op, the variable initializer and a
        ``Saver``. Everything is stored as attributes on ``self``; nothing
        is returned.

        Args:
            args: configuration object. This method reads ``batch_size``,
                ``num_feature``, ``model``, ``num_layer``, ``num_hidden``,
                ``num_class``, ``activation``, ``optimizer``,
                ``learning_rate``, ``grad_clip`` and ``log_dir`` from it.
            maxTimeSteps: first dimension of the input placeholder.
        """
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Input features, declared as (maxTimeSteps, batch, feature).
            self.inputX = tf.placeholder(
                tf.float32,
                shape=(maxTimeSteps, args.batch_size,
                       args.num_feature))  # e.g. [maxL, 32, 39]
            inputXrs = tf.reshape(self.inputX, [-1, args.num_feature])
            # Split the flattened input into maxTimeSteps pieces along axis 0.
            self.inputList = tf.split(
                inputXrs, maxTimeSteps,
                0)  # maxTimeSteps equal slices of inputXrs along axis 0
            # The sparse target is assembled from three separately fed parts.
            self.targetIxs = tf.placeholder(tf.int64)
            self.targetVals = tf.placeholder(tf.int32)
            self.targetShape = tf.placeholder(tf.int64)
            self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals,
                                           self.targetShape)
            self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size))
            # Summary of the model settings, kept on the instance.
            self.config = {
                'name': args.model,
                'rnncell': self.cell_fn,
                'num_layer': args.num_layer,
                'num_hidden': args.num_hidden,
                'num_class': args.num_class,
                'activation': args.activation,
                'optimizer': args.optimizer,
                'learning rate': args.learning_rate
            }

            # NOTE(review): this passes self.args, not the args parameter --
            # confirm self.args is set before build_graph is called.
            fbHrs = build_multi_dynamic_brnn(self.args, maxTimeSteps,
                                             self.inputX, self.cell_fn,
                                             self.seqLengths)
            with tf.name_scope('fc-layer'):
                with tf.variable_scope('fc'):
                    # Note: name='weightsClasses' is given to the
                    # truncated_normal op, not to the Variable itself.
                    weightsClasses = tf.Variable(
                        tf.truncated_normal([args.num_hidden, args.num_class],
                                            name='weightsClasses'))
                    biasesClasses = tf.Variable(tf.zeros([args.num_class]),
                                                name='biasesClasses')
                    # The same weights and biases are applied to every
                    # element of fbHrs.
                    logits = [
                        tf.matmul(t, weightsClasses) + biasesClasses
                        for t in fbHrs
                    ]
            # Stack the per-element logits into one tensor.
            logits3d = tf.stack(logits)
            self.loss = tf.reduce_mean(
                ctc.ctc_loss(self.targetY, logits3d, self.seqLengths))
            # Variable lists kept for later inspection by callers.
            self.var_op = tf.global_variables()
            self.var_trainable_op = tf.trainable_variables()

            # grad_clip == -1 is the sentinel for "no clipping".
            if args.grad_clip == -1:
                # not apply gradient clipping
                self.optimizer = tf.train.AdamOptimizer(
                    args.learning_rate).minimize(self.loss)
            else:
                # apply gradient clipping
                grads, _ = tf.clip_by_global_norm(
                    tf.gradients(self.loss, self.var_trainable_op),
                    args.grad_clip)
                opti = tf.train.AdamOptimizer(args.learning_rate)
                self.optimizer = opti.apply_gradients(
                    zip(grads, self.var_trainable_op))
            # Argmax over axis 2, cut down to the first seqLengths[0] rows
            # and the first column.
            self.logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0],
                                          [self.seqLengths[0], 1])
            # First path returned by the beam search decoder, as int32.
            self.predictions = tf.to_int32(
                ctc.ctc_beam_search_decoder(logits3d, self.seqLengths)[0][0])
            # Sum of unnormalized edit distances divided by the number of
            # target values.
            self.errorRate = tf.reduce_sum(
                tf.edit_distance(self.predictions,
                                 self.targetY,
                                 normalize=False)) / tf.to_float(
                                     tf.size(self.targetY.values))
            self.initial_op = tf.global_variables_initializer()
            self.saver = tf.train.Saver(tf.global_variables(),
                                        max_to_keep=5,
                                        keep_checkpoint_every_n_hours=1)
            # Log file name: log_dir plus a timestamp with spaces and
            # slashes removed. The replace() calls apply only to the
            # timestamp part, not to log_dir.
            self.logfile = args.log_dir + str(
                datetime.datetime.strftime(datetime.datetime.now(),
                                           '%Y-%m-%d %H:%M:%S') +
                '.txt').replace(' ', '').replace('/', '')
예제 #12
0
# Output stage of the network: swap the first two axes of the LSTM output,
# apply one linear layer, take a softmax, then build the CTC loss, the
# training op and the evaluation ops.
lstm_output_tr=tf.transpose(lstm_outputs_re, [1,0,2])
W2 = weight_variable([2*hidden_size,output_size])
b2 = bias_variable([output_size])
# An earlier version read n_batch, n_time_steps and n_features from the input
# layer here. n_time_steps is still used below, so it must be defined elsewhere.
# Flatten to 2-D so the linear layer is one matmul over all rows.
l_reshape3 = tf.reshape(lstm_output_tr,[-1,2*hidden_size] )
h_2 = tf.matmul(l_reshape3,W2) + b2

l_reshape4 = tf.reshape(h_2,[-1,output_size])

l_soft = tf.nn.softmax(l_reshape4)
# Restore a 3-D shape, then swap the first two axes for the loss.
l_soft_reshaped = tf.reshape(l_soft,[-1,n_time_steps,output_size])
l_soft_tr = tf.transpose(l_soft_reshaped, [1,0,2])
# NOTE(review): ctc_loss is documented as taking unnormalized logits, but
# l_soft_tr is already a softmax output -- confirm this is intended.
loss = tf.reduce_mean(tf.nn.ctc_loss(l_soft_tr, targets,seqLengths))
optimizer = tf.train.AdamOptimizer(learningRate).minimize(loss)
# NOTE(review): the two ops below use l_soft_reshaped, while the loss uses
# the transposed l_soft_tr -- confirm which layout the decoder expects.
logitsMaxTest = tf.slice(tf.argmax(l_soft_reshaped, 2), [0, 0], [seqLengths[0], 1])
predictions = tf.to_int32(ctc.ctc_beam_search_decoder(l_soft_reshaped , seqLengths)[0][0])
# Sum of unnormalized edit distances divided by the number of target values.
errorRate = tf.reduce_sum(tf.edit_distance(predictions, targets, normalize=False)) / \
                tf.to_float(tf.size(targets.values))
def getminibatch(x, y, bs, seq_len=776):
    """Draw a random minibatch, without replacement, from ``x`` and ``y``.

    Args:
        x: indexable collection of inputs; ``len(x)`` sets the sampling
            range.
        y: indexable collection of targets, indexed in parallel with ``x``.
        bs: requested batch size. At most ``len(x)`` items are drawn.
        seq_len: value recorded as the length of every drawn sequence.
            Defaults to 776, the value that used to be hard-coded.

    Returns:
        dict with the keys:
            'x': array of the drawn inputs.
            'ind', 'val', 'shape': the three values returned by
                ``target_list_to_sparse_tensor`` for the drawn targets.
            'seqlen': float64 array holding ``seq_len`` once per drawn
                item.
    """
    perm = np.random.permutation(len(x))
    toselect = perm[:bs]

    batch = {'x': np.array([x[i] for i in toselect])}
    batch['ind'], batch['val'], batch['shape'] = target_list_to_sparse_tensor(
        [y[i] for i in toselect])
    # Sized by the number of items actually drawn, so 'seqlen' stays aligned
    # with 'x' even when bs exceeds len(x).
    batch['seqlen'] = np.full(len(toselect), seq_len, dtype=np.float64)
    return batch

# Training-loop settings. How they are consumed is not visible in this
# snippet -- TODO confirm against the loop that reads them.
number_of_batches = 100
batch_size_var = 38
nEpochs = 100
예제 #13
0
    # Restore a 3-D shape of (batch_s, <inferred>, num_classes).
    logits = tf.reshape(logits, [batch_s, -1, num_classes])

    # Swap dimensions to time major for CTC loss.
    logits = tf.transpose(logits, (1, 0, 2))

    # CTC loss values, then their mean as the training cost.
    loss = ctc.ctc_loss(targets, logits, seq_len)
    cost = tf.reduce_mean(loss)

    # Record the loss
    tf.contrib.deprecated.scalar_summary('loss', cost)

    # Momentum optimizer with Nesterov updates enabled.
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=momentum,
                                           use_nesterov=True).minimize(cost)

    # Beam search decoding with the decoder's default settings.
    decoded, log_prob = ctc.ctc_beam_search_decoder(inputs=logits,
                                                    sequence_length=seq_len)

    # Label error rate using the edit distance between output and target
    ler = tf.reduce_mean(
        tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

    # Record the label error rate
    tf.contrib.deprecated.scalar_summary('label error rate', ler)

    # Checkpointing, merged summaries, and separate train/test log writers.
    # `graph` is not defined in this snippet; it must come from the
    # enclosing scope.
    saver = tf.train.Saver()
    merged = tf.contrib.deprecated.merge_all_summaries()
    train_writer = tf.summary.FileWriter('./summaries/train', graph)
    test_writer = tf.summary.FileWriter('./summaries/test', graph)


def test_decoding(input_feed_dict, input_original):
예제 #14
0
def train(audio_processer, num_inputs, num_classes, model_architecture,
          model_size_info, learning_rate, training_steps, batch_size, aligning,
          eval_step_interval, output_dir):
    """Train a CTC model, evaluate on dev periodically, then score on test.

    Builds the graph, restores the latest checkpoint under
    ``output_dir + 'train/'`` if one exists, runs the training loop, and
    finally decodes every test batch, printing one reference/decoded pair
    per batch and the mean test error.

    ``lexicon``, ``prepare_model_settings``, ``create_model``,
    ``sparse_tuple_to_string`` and ``trans_array_to_string`` are resolved
    from the enclosing module.

    Args:
        audio_processer: data source. Must provide
            ``get_max_step(aligning)``,
            ``get_batch_count(batch_size, split)`` and
            ``get_data(offset, batch_size, split, aligning)``; the latter's
            result is indexed as ``[0]`` inputs, ``[1]`` sparse labels,
            ``[2]`` sequence lengths.
        num_inputs: size of the last dimension of the input placeholder.
        num_classes: forwarded to ``prepare_model_settings``.
        model_architecture: forwarded to ``create_model``.
        model_size_info: forwarded to ``create_model``.
        learning_rate: indexable with two rates. Index 0 is used while the
            step number is at most ``training_steps[0]``, index 1 after.
        training_steps: sequence of step counts; their sum is the number of
            outer training iterations.
        batch_size: number of items per batch.
        aligning: forwarded to ``audio_processer``.
        eval_step_interval: a checkpoint is saved and one random dev batch
            is evaluated every this many steps.
        output_dir: path prefix. ``'train/'`` is appended by plain string
            concatenation, so it should end with a path separator.
    """
    X = tf.placeholder(
        dtype=tf.float32,
        shape=[None, audio_processer.get_max_step(aligning), num_inputs],
        name='input_tensor')
    sequence_len = tf.placeholder(dtype=tf.int32,
                                  shape=[None],
                                  name='sequence_len')
    Y = tf.sparse_placeholder(dtype=tf.int32, name='output_tensor')

    model_settings = prepare_model_settings(20, num_classes)
    logits, dropout_prob = create_model(X, sequence_len, model_settings,
                                        model_architecture, model_size_info,
                                        True)

    with tf.name_scope('loss'):
        avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(Y, logits, sequence_len))
        tf.summary.scalar('loss', avg_loss)
    with tf.name_scope('train'):
        # The rate is fed per step so it can switch between schedule phases.
        learning_rate_input = tf.placeholder(tf.float32, [],
                                             name='learning_rate_input')
        train_step = tf.train.AdamOptimizer(
            learning_rate=learning_rate_input).minimize(avg_loss)
    with tf.name_scope("decoder"):
        decoder, _ = ctc_ops.ctc_beam_search_decoder(logits,
                                                     sequence_len,
                                                     merge_repeated=False)
    with tf.name_scope("accuracy"):
        # Mean edit distance; logged under the 'accuracy' summary tag.
        evaluation_step = tf.reduce_mean(
            tf.edit_distance(tf.cast(decoder[0], tf.int32), Y))
        tf.summary.scalar('accuracy', evaluation_step)

    if tf.test.gpu_device_name():
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    else:
        sess = tf.InteractiveSession()
    saver = tf.train.Saver(max_to_keep=2)
    sess.run(tf.global_variables_initializer())
    ckpt = tf.train.latest_checkpoint(output_dir + 'train/')
    if ckpt:
        saver.restore(sess, ckpt)

    merged_summaries = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(output_dir + 'train/logs/',
                                         sess.graph)

    num_train_batches = audio_processer.get_batch_count(batch_size, 'train')
    num_dev_batches = audio_processer.get_batch_count(batch_size, 'dev')

    total_training_step = sum(training_steps)
    for training_step in range(1, total_training_step + 1):
        total_train_loss = 0
        epoch_start = time.time()

        learning_rate_value = learning_rate[
            1 if training_step > training_steps[0] else 0]
        for batch in range(num_train_batches):
            data = audio_processer.get_data(batch * batch_size, batch_size,
                                            'train', aligning)
            loss, _ = sess.run(
                [avg_loss, train_step],
                feed_dict={
                    X: data[0],
                    Y: data[1],
                    sequence_len: data[2],
                    learning_rate_input: learning_rate_value,
                    dropout_prob: 0.95
                })
            total_train_loss += loss

        time_cost = time.time() - epoch_start
        print('training step: %d/%d, train loss: %g, time cost: %.2fs' %
              (training_step, total_training_step,
               total_train_loss / num_train_batches, time_cost))

        if training_step % eval_step_interval == 0:
            saver.save(sess,
                       output_dir + "train/speech-model.ckpt",
                       global_step=training_step)

            # Evaluate on one randomly chosen dev batch.
            rand_batch = random.randint(0, num_dev_batches - 1)
            data = audio_processer.get_data(rand_batch * batch_size,
                                            batch_size, 'dev', aligning)
            dev_accuracy = sess.run(evaluation_step,
                                    feed_dict={
                                        X: data[0],
                                        Y: data[1],
                                        sequence_len: data[2],
                                        dropout_prob: 1.0
                                    })
            print('WER: %.2f, training step: %d/%d' %
                  (dev_accuracy, training_step, total_training_step))

    # Built once: creating this op inside the test loop would add a new node
    # to the graph for every batch.
    dense_decode_op = tf.sparse_tensor_to_dense(decoder[0], default_value=-1)

    total_test_accuracy = 0
    num_test_batches = audio_processer.get_batch_count(batch_size, 'test')
    # Batches are numbered from 0, as in the training loop, so the offset
    # batch * batch_size covers the first batch and stays inside the data.
    for batch in range(num_test_batches):
        data = audio_processer.get_data(batch * batch_size, batch_size, 'test',
                                        aligning)
        dense_decodes, accuracy = sess.run([dense_decode_op, evaluation_step],
                                           feed_dict={
                                               X: data[0],
                                               Y: data[1],
                                               sequence_len: data[2],
                                               dropout_prob: 1.0
                                           })

        total_test_accuracy += accuracy
        # Reference text comes from the labels of the batch just decoded.
        dense_labels = sparse_tuple_to_string(data[1], lexicon)
        # Print only the first reference/decoded pair of each batch.
        for orig, decode_array in zip(dense_labels, dense_decodes):
            decoded_str = trans_array_to_string(decode_array, lexicon)
            print('语音原始文本: {}'.format(orig))
            print('识别出来的文本: {}'.format(decoded_str))
            break

    # Report the mean over test batches, matching how the training loss is
    # reported; guard against an empty test split.
    if num_test_batches:
        final_wer = total_test_accuracy / num_test_batches
    else:
        final_wer = 0.0
    print('Final WER: %.2f, train steps: %d' %
          (final_wer, total_training_step))

    sess.close()
예제 #15
0
def continue_train():
    """Resume BiRNN CTC training from a fixed checkpoint.

    Rebuilds the graph, restores ``savedir + "BiRNN.cpkt-204"``, then keeps
    training from the epoch encoded in that checkpoint name up to epoch
    1000, saving a checkpoint after every epoch. Every 50 batches it
    decodes the current batch with dropout disabled and prints the label
    error rate plus the first reference/decoded pair.

    Takes no arguments and returns nothing. ``n_input``, ``n_context``,
    ``words_size``, ``words``, ``labels``, ``batch_size``, ``savedir``,
    ``keep_dropout_rate``, ``REGULARIZATION_RATE``, ``inference``,
    ``next_batch``, ``base`` and ``print_tensors_in_checkpoint_file`` are
    resolved from the enclosing module.
    """
    input_tensor = tf.placeholder(
        tf.float32, [None, None, n_input + (2 * n_input * n_context)],
        name='input')
    # ctc_loss needs a SparseTensor, which sparse_placeholder provides.
    targets = tf.sparse_placeholder(tf.int32, name='targets')  # label text
    keep_dropout = tf.placeholder(tf.float32)
    seq_length = tf.placeholder(tf.int32, [None],
                                name='seq_length')  # sequence lengths
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    logits = inference(input_tensor, words_size + 1, True, keep_dropout,
                       regularizer, tf.to_int64(seq_length))
    # Mean CTC loss plus everything collected under 'losses'.
    avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(
        targets, logits, seq_length)) + tf.add_n(tf.get_collection('losses'))
    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(avg_loss)
    with tf.name_scope("decode"):
        decoded, _ = ctc_ops.ctc_beam_search_decoder(
            logits, seq_length, merge_repeated=False)
        # Built once: creating this op inside the training loop would add a
        # new node to the graph at every evaluation.
        dense_decoded_op = tf.sparse_tensor_to_dense(decoded[0],
                                                     default_value=-1)
    with tf.name_scope("accuracy"):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
        # Mean edit distance is the label error rate.
        ler = tf.reduce_mean(distance, name='label_error_rate')
    epochs = 1000
    saver = tf.train.Saver(max_to_keep=5)
    with tf.Session() as sess:
        choose_cpkt = "BiRNN.cpkt-204"
        sess.run(tf.global_variables_initializer())
        print_tensors_in_checkpoint_file(savedir + choose_cpkt, None, True)
        saver.restore(sess, savedir + choose_cpkt)
        # Take the start epoch from the checkpoint name so the two cannot
        # drift apart.
        startepo = int(choose_cpkt.rsplit('-', 1)[1])
        train_start = time.time()
        # The batch count does not change between epochs.
        n_batches_per_epoch = int(np.ceil(len(labels) / batch_size))
        for epoch in range(startepo, epochs):  # passes over the sample set
            epoch_start = time.time()

            print("epoch start:", epoch + 1, "total epochs= ", epochs)
            print("total loop ", n_batches_per_epoch, "in one epoch,",
                  batch_size, "items in one loop")

            train_cost = 0
            train_ler = 0
            next_idx = 0

            for batch in range(n_batches_per_epoch):
                # Fetch the next batch_size items.
                next_idx, source, source_lengths, sparse_labels = next_batch(
                    labels, next_idx, batch_size)
                feed = {
                    input_tensor: source,
                    targets: sparse_labels,
                    seq_length: source_lengths,
                    keep_dropout: keep_dropout_rate
                }

                # One optimization step; accumulate the loss.
                batch_cost, _ = sess.run([avg_loss, optimizer], feed_dict=feed)
                train_cost += batch_cost

                if (batch + 1) % 50 == 0:
                    print('loop:', batch + 1, 'Train cost: ',
                          train_cost / (batch + 1))
                    # Same batch, but with dropout disabled for evaluation.
                    feed2 = {
                        input_tensor: source,
                        targets: sparse_labels,
                        seq_length: source_lengths,
                        keep_dropout: 1.0
                    }

                    dense_decoded, train_ler = sess.run(
                        [dense_decoded_op, ler], feed_dict=feed2)
                    dense_labels = base.sparse_tuple_to_texts_ch(
                        sparse_labels, words)

                    counter = 0
                    print('Label err rate: ', train_ler)
                    duration = time.time() - train_start
                    print('cost time: {:.2f} min'.format(duration / 60))
                    # Print only the first reference/decoded pair.
                    for orig, decoded_arr in zip(dense_labels, dense_decoded):
                        # convert to strings
                        decoded_str = base.ndarray_to_text_ch(
                            decoded_arr, words)
                        decoded_str = decoded_str.strip().strip('龚')
                        print(' file {}'.format(counter))
                        print('Original: {}'.format(orig))
                        print('Decoded:  {}'.format(decoded_str))
                        counter = counter + 1
                        break

            epoch_duration = time.time() - epoch_start

            # train_cost is the epoch sum; train_ler is the value from the
            # most recent 50-batch evaluation (0 if none ran).
            log = 'Epoch {}/{}, train_cost: {:.3f}, train_ler: {:.3f}, time: {:.2f} sec'
            print(
                log.format(epoch + 1, epochs, train_cost, train_ler,
                           epoch_duration))
            saver.save(sess, savedir + "BiRNN.cpkt", global_step=epoch + 1)
            print("save cpkt-%s complete." % (epoch + 1))
예제 #16
0
# Graph setup for BiRNN CTC training: model output, loss, optimizer,
# decoder, label error rate, saver and session.
# logits will be input for the loss function.
# nn_model is from the import statement in the load_model function
logits = BiRNN_model(input_tensor, tf.to_int64(seq_length), n_input, n_context,
                     words_size + 1, keep_dropout)

# Mean CTC loss.
avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(targets, logits, seq_length))

# Optimizer: Adam with a fixed learning rate.
learning_rate = 0.001
optimizer = tf.train.AdamOptimizer(
    learning_rate=learning_rate).minimize(avg_loss)

with tf.name_scope("decode"):
    decoded, log_prob = ctc_ops.ctc_beam_search_decoder(logits,
                                                        seq_length,
                                                        merge_repeated=False)

with tf.name_scope("accuracy"):
    distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
    # Mean edit distance is the label error rate.
    ler = tf.reduce_mean(distance, name='label_error_rate')

epochs = 100
savedir = "log/yuyinchalltest/"
saver = tf.train.Saver(max_to_keep=1)  # keeps only the newest checkpoint
# create the session
sess = tf.Session()
# Initialize all variables from scratch (used when no saved model is loaded).
sess.run(tf.global_variables_initializer())
예제 #17
0
    def _create_graph(self, num_hidden, batch_size, max_time_steps,
                      num_features, conv_depth, num_classes):
        """Build the conv -> LSTM -> CTC graph and publish its endpoints on self.

        Sets ``self.graph``, the placeholders ``self.inputs`` /
        ``self.targets`` / ``self.seq_len``, and the ops ``self.loss``,
        ``self.logitsMaxTest``, ``self.optimizer``, ``self.predictions``,
        ``self.error_rate`` and ``self.merged_summaries``.

        Args:
            num_hidden: Number of units of the LSTM cell.
            batch_size: Fixed batch dimension of the placeholders.
            max_time_steps: Fixed time dimension of the input placeholder.
            num_features: Feature dimension of the input placeholder.
            conv_depth: Depth argument passed to both conv2d layers.
            num_classes: Width of the output projection.
        """
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Feature input (e.g. log filter bank or MFCC features).
            feature_shape = [batch_size, max_time_steps, num_features]
            self.inputs = tf.placeholder(tf.float32, feature_shape)

            # ctc_loss needs its labels as a SparseTensor.
            self.targets = tf.sparse_placeholder(tf.int32)

            # One sequence length per batch item.
            self.seq_len = tf.placeholder(tf.int32, [batch_size])

            # Recurrent cell (an RNNCell or GRUCell could be used instead).
            lstm_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                                state_is_tuple=True)

            # Two stacked ReLU conv2d layers over the input with a trailing
            # channel axis added.
            as_image = pt.wrap(tf.expand_dims(self.inputs, -1))
            conv_first = as_image.conv2d(3,
                                         conv_depth,
                                         activation_fn=tf.nn.relu)
            conv_second = conv_first.conv2d(3,
                                            conv_depth,
                                            activation_fn=tf.nn.relu)

            # Fold the conv channels back into the feature axis for the RNN.
            rnn_input_shape = [
                batch_size, max_time_steps, num_features * conv_depth
            ]
            rnn_inputs = tf.reshape(conv_second, rnn_input_shape)

            # dynamic_rnn also returns the final state, which is not needed.
            outputs, _ = tf.nn.dynamic_rnn(lstm_cell,
                                           rnn_inputs,
                                           self.seq_len,
                                           dtype=tf.float32)

            dynamic_shape = tf.shape(self.inputs)
            batch_s, _ = dynamic_shape[0], dynamic_shape[1]

            # Flatten batch and time so one projection serves every step.
            flat_outputs = tf.reshape(outputs, [-1, num_hidden])

            # Projection weights: truncated normal, stddev 0.1; bias: zeros.
            proj_weights = tf.Variable(
                tf.truncated_normal([num_hidden, num_classes], stddev=0.1))
            proj_bias = tf.Variable(tf.constant(0., shape=[num_classes]))

            # Affine projection, restored to [batch, time, classes] and then
            # transposed to time-major.
            logits = tf.matmul(flat_outputs, proj_weights) + proj_bias
            logits = tf.transpose(
                tf.reshape(logits, [batch_s, -1, num_classes]), (1, 0, 2))

            per_item_loss = ctc.ctc_loss(logits, self.targets, self.seq_len)
            self.loss = tf.reduce_mean(per_item_loss)

            # Arg-max class per step for the first batch item, limited to
            # that item's sequence length.
            best_class = tf.argmax(logits, 2)
            self.logitsMaxTest = tf.slice(best_class, [0, 0],
                                          [self.seq_len[0], 1])

            self.optimizer = tf.train.AdamOptimizer().minimize(self.loss)

            # Best beam-search path, cast to int32 to match the targets.
            decoded_paths = ctc.ctc_beam_search_decoder(logits,
                                                        self.seq_len)[0]
            self.predictions = tf.to_int32(decoded_paths[0])

            # Total unnormalized edit distance divided by the label count.
            total_edits = tf.reduce_sum(
                tf.edit_distance(self.predictions,
                                 self.targets,
                                 normalize=False))
            label_count = tf.to_float(tf.size(self.targets.values))
            self.error_rate = total_edits / label_count

            tf.scalar_summary('loss', self.loss)
            tf.scalar_summary('error_rate', self.error_rate)
            self.merged_summaries = tf.merge_all_summaries()
예제 #18
0
def train_model(train_data=None, test_data=None, decode=False, file_decode=False):
    """Train a stacked bidirectional LSTM with CTC loss, or decode audio with it.

    The graph is built from module-level hyper-parameters (``num_features``,
    ``num_hidden``, ``num_classes``, ``num_layers``, ``learning_rate``,
    ``momentum``, ``num_epochs``).

    Args:
        train_data: Sequence of pre-built training batches, each indexed as
            ``(inputs, (sparse_indices, sparse_values, sparse_shape),
            seq_len)``. Required when ``decode`` is False.
        test_data: Batches in the same layout; only the first one is
            evaluated after each epoch.
        decode: When True, skip training, restore the checkpoint found in
            ``P.MODEL_PATH`` and decode wav files in an endless loop.
        file_decode: In decode mode, prompt for a wav path instead of
            recording ``temp.wav`` with ``sox``.

    Raises:
        IOError: In decode mode, when no checkpoint state is found.
    """
    graph = tf.Graph()
    with graph.as_default():
        # e.g: log filter bank or MFCC features
        # Has size [batch_size, max_stepsize, num_features], but the
        # batch_size and max_stepsize can vary along each step
        inputs = tf.placeholder(tf.float32, [None, None, num_features])

        # The sparse targets are fed as their three components.
        targets_idx = tf.placeholder(tf.int64)
        targets_val = tf.placeholder(tf.int32)
        targets_shape = tf.placeholder(tf.int64)
        targets = tf.SparseTensor(targets_idx, targets_val, targets_shape)
        # 1d array of size [batch_size]
        seq_len = tf.placeholder(tf.int32, [None])

        # Weights & biases
        weight_classes = tf.Variable(tf.truncated_normal([num_hidden, num_classes],
                                                         mean=0, stddev=0.1,
                                                         dtype=tf.float32))
        bias_classes = tf.Variable(tf.zeros([num_classes]), dtype=tf.float32)

        # Network
        forward_cell = tf.nn.rnn_cell.LSTMCell(num_hidden, use_peepholes=True, state_is_tuple=True)
        backward_cell = tf.nn.rnn_cell.LSTMCell(num_hidden, use_peepholes=True, state_is_tuple=True)

        # NOTE(review): every layer of a stack is the same cell object; left
        # unchanged so variable names stay compatible with saved checkpoints.
        stack_forward_cell = tf.nn.rnn_cell.MultiRNNCell([forward_cell] * num_layers,
                                                         state_is_tuple=True)
        stack_backward_cell = tf.nn.rnn_cell.MultiRNNCell([backward_cell] * num_layers,
                                                          state_is_tuple=True)

        outputs, _ = tf.nn.bidirectional_dynamic_rnn(stack_forward_cell,
                                                     stack_backward_cell,
                                                     inputs,
                                                     sequence_length=seq_len,
                                                     time_major=False, # [batch_size, max_time, num_hidden]
                                                     dtype=tf.float32)
        inputs_shape = tf.shape(inputs)
        batch_size = inputs_shape[0]  # symbolic; only valid inside the graph

        # Forward and backward outputs share one projection matrix and are
        # summed before the bias is added.
        fw_output = tf.reshape(outputs[0], [-1, num_hidden])
        bw_output = tf.reshape(outputs[1], [-1, num_hidden])
        logits = tf.add(tf.add(tf.matmul(fw_output, weight_classes), tf.matmul(bw_output, weight_classes)), bias_classes)

        logits = tf.reshape(logits, [batch_size, -1, num_classes])
        loss = tf.reduce_mean(ctc_ops.ctc_loss(logits, targets, seq_len, time_major=False))
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum).minimize(loss)

        # Evaluating: the decoder is given the logits transposed to
        # [max_time, batch_size, num_classes].
        decoded, log_prob = ctc_ops.ctc_beam_search_decoder(tf.transpose(logits, perm=[1, 0, 2]), seq_len)
        label_error_rate = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)

    with tf.Session(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options)) as session:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)
        if not decode:
            # NOTE(review): checkpoints are looked up in ENV.output but
            # written to P.OUTPUT below -- confirm both name the same directory.
            ckpt = tf.train.get_checkpoint_state(ENV.output)
            if ckpt:
                print('load', ckpt.model_checkpoint_path)
                saver.restore(session, ckpt.model_checkpoint_path)

            num_batch = len(train_data)
            for curr_epoch in range(num_epochs):
                train_cost = 0
                train_ler = 0
                num_examples = 0
                # NOTE(review): the last batch is never trained on (the range
                # stops at num_batch - 1); kept as in the original.
                for i in range(num_batch - 1):
                    batch = train_data[i]
                    feed = {
                        inputs: batch[0],
                        targets_idx: batch[1][0],
                        targets_val: batch[1][1],
                        targets_shape: batch[1][2],
                        seq_len: batch[2]
                    }
                    batch_cost, _ = session.run([loss, optimizer], feed)
                    batch_ler = session.run(label_error_rate, feed_dict=feed)
                    # Weight by the real number of examples in this batch.
                    # Multiplying by the symbolic `batch_size` tensor would
                    # turn the running totals into tensors and add a new op
                    # to the graph on every step.
                    batch_examples = len(batch[2])
                    train_cost += batch_cost * batch_examples
                    train_ler += batch_ler * batch_examples
                    num_examples += batch_examples
                    log = "Epoch {}/{}, iter {}, batch_cost {}"
                    logging.info(log.format(curr_epoch+1, num_epochs, i, batch_cost))

                if num_examples:
                    train_cost /= num_examples
                    train_ler /= num_examples
                log = "Epoch {}/{}, train_cost {}, train_ler {}"
                logging.info(log.format(curr_epoch+1, num_epochs, train_cost, train_ler))
                saver.save(session, os.path.join(P.OUTPUT, 'best.ckpt'), global_step=curr_epoch)

                # Evaluate on the first test batch only. The sparse shape must
                # come from the test batch too, not from the training data.
                test_batch = test_data[0]
                feed_test = {
                    inputs: test_batch[0],
                    targets_idx: test_batch[1][0],
                    targets_val: test_batch[1][1],
                    targets_shape: test_batch[1][2],
                    seq_len: test_batch[2]
                }
                test_cost, test_ler = session.run([loss, label_error_rate], feed_dict=feed_test)
                log = "Epoch {}/{}, test_cost {}, test_ler {}"
                logging.info(log.format(curr_epoch+1, num_epochs, test_cost, test_ler))
        else:
            # DECODE
            ckpt = tf.train.get_checkpoint_state(P.MODEL_PATH)
            if ckpt is None:
                raise IOError('no checkpoint found in {}'.format(P.MODEL_PATH))
            print('load', ckpt.model_checkpoint_path)
            saver = tf.train.Saver()
            saver.restore(session, ckpt.model_checkpoint_path)

            # Console input that works on both Python 2 and Python 3.
            try:
                read_line = raw_input
            except NameError:
                read_line = input

            while True:
                # Prepare the input file.
                if file_decode:
                    wav_file = read_line('Enter the wav file path:')
                else:
                    wav_file = 'temp.wav'
                    read_line('Press Enter to start...')
                    # Start the recorder outside the try block so `sox` is
                    # always bound when Ctrl-C stops the recording.
                    sox = subprocess.Popen(['sox', '-d', '-b', '16', '-c', '1', '-r', '16000', 'temp.wav'])
                    try:
                        sox.communicate()
                    except KeyboardInterrupt:
                        os.kill(sox.pid, signal.SIGTERM)
                        if sox.poll() is None:
                            time.sleep(2)
                    print('Done recording')
                features = process_wav(wav_file)
                # The single utterance is replicated into a batch of 16.
                batch_features = np.array([features for _ in range(16)])
                batch_seq_len = np.array([features.shape[0] for _ in range(16)])
                print(batch_features.shape)
                feed = {
                    inputs: batch_features,
                    seq_len: batch_seq_len
                }
                d, oc = session.run([decoded[0], outputs], feed_dict=feed)
                dsp = d.shape #[16 86]
                res = []
                print(size(oc))
                for label in d.values[:dsp[1]]: # id of phoneme
                    # Every phoneme whose id equals label + 1 is reported.
                    for k, v in phoneme_set_39.items():
                        if v == label + 1:
                            res.append(k)
                print(res)
    backwardH1 = tf.contrib.rnn.LSTMCell(nHidden, use_peepholes=True, state_is_tuple=True)
    fbH1, _, _ = tf.contrib.rnn.static_bidirectional_rnn(forwardH1, backwardH1, inputList, dtype=tf.float32,
                                                         scope='BDLSTM_H1')
    fbH1rs = [tf.reshape(t, [batchSize, 2, nHidden]) for t in fbH1]
    outH1 = [tf.reduce_sum(tf.multiply(t, weightsOutH1), reduction_indices=1) + biasesOutH1 for t in fbH1rs]

    logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]

    ####Optimizing
    logits3d = tf.stack(logits)
    loss = tf.reduce_mean(ctc.ctc_loss(targetY, logits3d, seqLengths))
    optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

    ####Evaluating
    logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [seqLengths[0], 1])
    predictions = tf.to_int32(ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
    errorRate = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False)) / \
                tf.to_float(tf.size(targetY.values))

####Run session
with tf.Session(graph=graph) as session:
    print('Initializing')
    tf.global_variables_initializer().run()
    for epoch in range(nEpochs):
        print('Epoch', epoch+1, '...')
        batchErrors = np.zeros(len(batchedData))
        batchRandIxs = np.random.permutation(len(batchedData)) #randomize batch order
        for batch, batchOrigI in enumerate(batchRandIxs):
            batchInputs, batchTargetSparse, batchSeqLengths = batchedData[batchOrigI]
            batchTargetIxs, batchTargetVals, batchTargetShape = batchTargetSparse
            feedDict = {inputX: batchInputs, targetIxs: batchTargetIxs, targetVals: batchTargetVals,
예제 #20
0
def train(train_sample_files, train_vector_labels, test_sample_files,
          test_vector_labels, lexicon, num_inputs, num_contexts,
          training_steps, learning_rate, batch_size, summaries_dir, train_dir,
          eval_step_interval, model_architecture, model_size_info):
    """Train a CTC speech model, evaluating and checkpointing periodically.

    Args:
        train_sample_files: Training samples, consumed by get_next_batches.
        train_vector_labels: Labels matching ``train_sample_files``.
        test_sample_files: Test samples, consumed by get_next_batches.
        test_vector_labels: Labels matching ``test_sample_files``.
        lexicon: Symbol table; the model gets ``len(lexicon) + 1`` classes.
        num_inputs: Features per frame.
        num_contexts: Context frames on each side; the input width is
            ``num_inputs + 2 * num_inputs * num_contexts``.
        training_steps: Number of passes over the training batches.
        learning_rate: Learning rate of the Adam optimizer.
        batch_size: Examples per batch; a trailing partial batch is dropped.
        summaries_dir: Directory for the summary FileWriter.
        train_dir: Checkpoint location. It is concatenated directly with the
            file name, so it should end with a path separator.
        eval_step_interval: Save and evaluate every this many steps.
        model_architecture: Passed through to create_model.
        model_size_info: Passed through to create_model.

    The value printed as "WER" is the mean of ``tf.edit_distance`` between
    the best decoded path and the reference labels.
    """
    use_gpu = False
    device_name = tf.test.gpu_device_name()
    if not device_name:
        warnings.warn(
            'No GPU found. Please use a GPU to train your neural network.')
    else:
        use_gpu = True
        print('Found GPU at: {}'.format(device_name))

    X = tf.placeholder(
        dtype=tf.float32,
        shape=[None, None, num_inputs + (2 * num_inputs * num_contexts)],
        name='input')
    sequence_len = tf.placeholder(dtype=tf.int32,
                                  shape=[None],
                                  name='sequence_len')
    Y = tf.sparse_placeholder(dtype=tf.int32)

    # One class more than the lexicon (presumably the CTC blank -- TODO confirm).
    num_character = len(lexicon) + 1
    model_settings = prepare_model_settings(20, num_character, use_gpu)
    logits, dropout_prob = create_model(X, sequence_len, model_settings,
                                        model_architecture, model_size_info,
                                        True)

    with tf.name_scope('loss'):
        avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(Y, logits, sequence_len))
        tf.summary.scalar('loss', avg_loss)
    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(avg_loss)
    with tf.name_scope("decoder"):
        decoder, _ = ctc_ops.ctc_beam_search_decoder(logits,
                                                     sequence_len,
                                                     merge_repeated=False)
        # Built once here. Creating this op inside the final evaluation loop
        # would add a new node to the graph for every batch.
        dense_decoded = tf.sparse_tensor_to_dense(decoder[0],
                                                  default_value=-1)
    with tf.name_scope("accuracy"):
        evaluation_step = tf.reduce_mean(
            tf.edit_distance(tf.cast(decoder[0], tf.int32), Y))
        tf.summary.scalar('accuracy', evaluation_step)

    if use_gpu:
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    else:
        sess = tf.InteractiveSession()
    saver = tf.train.Saver(max_to_keep=1)

    sess.run(tf.global_variables_initializer())
    # Resume from the latest checkpoint when one exists.
    ckpt = tf.train.latest_checkpoint(train_dir)
    if ckpt is not None:
        saver.restore(sess, ckpt)

    # Summaries are merged but not written; only the graph is exported.
    merged_summaries = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(summaries_dir, sess.graph)

    num_train_batches = len(train_sample_files) // batch_size
    num_test_batches = len(test_sample_files) // batch_size
    # Denominators that stay valid when a data set is smaller than one batch.
    train_batch_count = max(num_train_batches, 1)
    test_batch_count = max(num_test_batches, 1)

    for training_step in range(training_steps):
        total_loss = 0
        for train_batch in range(num_train_batches):
            sparse_labels, batch_samples, num_steps = get_next_batches(
                batch_size * train_batch, train_sample_files,
                train_vector_labels, num_contexts, batch_size)

            loss, _ = sess.run(
                [avg_loss, train_step],
                feed_dict={
                    X: batch_samples,
                    Y: sparse_labels,
                    sequence_len: num_steps,
                    dropout_prob: 0.95
                })
            total_loss += loss

        print('training step: %d/%d, loss: %g' %
              (training_step + 1, training_steps,
               total_loss / train_batch_count))

        if (training_step + 1) % eval_step_interval == 0:
            saver.save(sess,
                       train_dir + "speech.ckpt",
                       global_step=training_step)

            total_test_accuracy = 0
            for test_batch in range(num_test_batches):
                sparse_labels, batch_samples, num_steps = get_next_batches(
                    batch_size * test_batch, test_sample_files,
                    test_vector_labels, num_contexts, batch_size)

                # Use the session explicitly: Tensor.eval() needs a default
                # session, which a plain tf.Session (GPU path) does not install.
                test_accuracy = sess.run(
                    evaluation_step,
                    feed_dict={
                        X: batch_samples,
                        Y: sparse_labels,
                        sequence_len: num_steps,
                        dropout_prob: 1.0
                    })
                total_test_accuracy += test_accuracy

            print('WER: %.2f, training step: %d/%d' %
                  (total_test_accuracy / test_batch_count, training_step + 1,
                   training_steps))

    total_accuracy = 0
    for test_batch in range(num_test_batches):
        sparse_labels, batch_samples, num_steps = get_next_batches(
            batch_size * test_batch, test_sample_files, test_vector_labels,
            num_contexts, batch_size)

        dense_decodes, accuracy = sess.run(
            [dense_decoded, evaluation_step],
            feed_dict={
                X: batch_samples,
                Y: sparse_labels,
                sequence_len: num_steps,
                dropout_prob: 1.0
            })

        total_accuracy += accuracy
        dense_labels = trans_tuple_to_texts(sparse_labels, lexicon)

        # Show only the first reference/hypothesis pair of each batch.
        for orig, decode_array in zip(dense_labels, dense_decodes):
            decoded_str = trans_array_to_text(decode_array, lexicon)
            print('语音原始文本: {}'.format(orig))
            print('识别出来的文本: {}'.format(decoded_str))
            break

    print('Final WER: %.2f, train steps: %d' %
          (total_accuracy / test_batch_count, training_steps))

    # Release the event file and the session's resources.
    train_writer.close()
    sess.close()
예제 #21
0
def get_eval(logits3d, target_y, seq_lens):
    """Build the evaluation ops for a CTC model.

    Args:
        logits3d: Rank-3 logits tensor, passed unchanged to the beam-search
            decoder (presumably time-major -- TODO confirm against callers).
        target_y: Sparse tensor of reference labels.
        seq_lens: Sequence length per batch item.

    Returns:
        ``(error, logits_test)``: the summed unnormalized edit distance of
        the best decoded path divided by the number of target labels, and
        the arg-max class of the first ``seq_lens[0]`` rows of column 0.
    """
    best_class = tf.argmax(logits3d, 2)
    logits_test = tf.slice(best_class, [0, 0], [seq_lens[0], 1])

    decoded_paths = ctc.ctc_beam_search_decoder(logits3d, seq_lens)[0]
    predictions = tf.to_int32(decoded_paths[0])

    total_edits = tf.reduce_sum(
        tf.edit_distance(predictions, target_y, normalize=False))
    label_count = tf.to_float(tf.size(target_y.values))
    error = total_edits / label_count
    return error, logits_test
예제 #22
0
파일: decode.py 프로젝트: KGPML/KGP-ASR
def runCTC(batch):
    """Run the bidirectional-LSTM CTC network on one batch and return its logits.

    Weights are restored from the checkpoint under '/users/TeamASR/models'
    when one exists; otherwise the variables are freshly initialized.

    Args:
        batch: Array fed to the ``(maxTimeSteps, batchSize, nFeatures)``
            input placeholder, i.e. shaped ``(776, batch_size, 39)``;
            ``batch.shape[1]`` is used as the batch size.

    Returns:
        A one-element list holding the evaluated ``logits3d`` array, shaped
        ``(maxTimeSteps, batchSize, nClasses)``.
    """
    ####Learning Parameters
    learningRate = 0.001
    momentum = 0.9
    batchSize = batch.shape[1]

    ####Network Parameters
    nFeatures = 39 #12 MFCC coefficients + energy, and derivatives
    nHidden = 256
    # NOTE(review): the previous comment said "39 phonemes, plus the blank",
    # which does not match 30 -- presumably the size of the character map
    # plus the CTC blank; confirm against the training script.
    nClasses = 30

    ####Load data
    print('Loading data')
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('TIMIT_data_prepared_for_CTC.pkl','rb') as f:
        data= pickle.load(f)
    charmap = data['chars']
    print(charmap)
    charmap.append('_')
    maxTimeSteps = 776

    ####Define graph
    print('Defining graph')
    graph = tf.Graph()
    with graph.as_default():

        ####NOTE: try variable-steps inputs and dynamic bidirectional rnn, when it's implemented in tensorflow

        ####Graph input
        inputX = tf.placeholder(tf.float32, shape=(maxTimeSteps, batchSize, nFeatures))
        #Prep input data to fit requirements of rnn.bidirectional_rnn
        #  Reshape to 2-D tensor (nTimeSteps*batchSize, nfeatures)
        inputXrs = tf.reshape(inputX, [-1, nFeatures])
        #  Split into a list of maxTimeSteps tensors, one per time step
        inputList = tf.split(0, maxTimeSteps, inputXrs)
        targetIxs = tf.placeholder(tf.int64)
        targetVals = tf.placeholder(tf.int32)
        targetShape = tf.placeholder(tf.int64)
        targetY = tf.SparseTensor(targetIxs, targetVals, targetShape)
        seqLengths = tf.placeholder(tf.int32, shape=[batchSize])

        ####Weights & biases
        # The *H2 variables are unused here but are still created, so the
        # auto-generated variable names keep matching existing checkpoints.
        weightsOutH1 = tf.Variable(tf.truncated_normal([2, nHidden],
                                                       stddev=np.sqrt(2.0 / (2*nHidden))))
        biasesOutH1 = tf.Variable(tf.zeros([nHidden]))
        weightsOutH2 = tf.Variable(tf.truncated_normal([2, nHidden],
                                                       stddev=np.sqrt(2.0 / (2*nHidden))))
        biasesOutH2 = tf.Variable(tf.zeros([nHidden]))
        weightsClasses = tf.Variable(tf.truncated_normal([nHidden, nClasses],
                                                         stddev=np.sqrt(2.0 / nHidden)))
        biasesClasses = tf.Variable(tf.zeros([nClasses]))

        ####Network
        forwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True, state_is_tuple=True)
        backwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True, state_is_tuple=True)
        fbH1, _, _ = bidirectional_rnn(forwardH1, backwardH1, inputList, dtype=tf.float32,
                                           scope='BDLSTM_H1')
        # Combine the two directions with a learned per-unit weighting.
        fbH1rs = [tf.reshape(t, [batchSize, 2, nHidden]) for t in fbH1]
        outH1 = [tf.reduce_sum(tf.mul(t, weightsOutH1), reduction_indices=1) + biasesOutH1 for t in fbH1rs]

        logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]

        ####Optimizing
        # The training ops are not run here; they are kept so the graph's
        # variable set (including optimizer slots) matches the checkpoint.
        logits3d = tf.pack(logits)
        loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
        optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

        ####Evaluating
        logitsMaxTest = tf.slice(tf.argmax(logits3d,2), [0, 0], [seqLengths[0], 1])
        predictions = tf.to_int32(ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
        errorRate = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False)) / \
                    tf.to_float(tf.size(targetY.values))

    ####Run session
    with tf.Session(graph=graph) as session:
        print('Initializing')
        saver = tf.train.Saver()

        ckpt = tf.train.get_checkpoint_state('/users/TeamASR/models')
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            session.run(tf.initialize_all_variables())
        # Every sequence is declared to span the full padded length, tied to
        # maxTimeSteps instead of repeating the literal.
        feedDict = {inputX: batch, seqLengths: (np.ones([batchSize])*maxTimeSteps)}
        logit = session.run([logits3d], feed_dict=feedDict)
    return logit
	print("building outH1 ")
	outH1 = [tf.reduce_sum(tf.multiply(t, weightsOutH1), axis=1) + biasesOutH1 for t in fbH1rs]
	print("building logits ")
	logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]
	print("len(outH1) %d"% len(outH1))
	####Optimizing
	print("building loss")
	logits3d = tf.stack(logits)
	loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
	out = tf.identity(loss, 'ctc_loss_mean')
	optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

	####Evaluating
	print("building Evaluation")
	logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [seqLengths[0], 1])
	predictions = tf.to_int32(ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
	reduced_sum = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False))
	errorRate = reduced_sum / tf.to_float(tf.size(targetY.values))

	check_op = tf.add_check_numerics_ops()
print("done building graph")

####Run session
with tf.Session(graph=graph) as session:
	try: merged = tf.summary.merge_all()
	except: merged = tf.summary.merge_all()
	try:writer = tf.summary.FileWriter("/tmp/basic_new", session.graph)
	except: writer = tf.summary.FileWriter("/tmp/basic_new", session.graph)
	try:saver = tf.train.Saver()  # defaults to saving all variables
	except:
		print("tf.train.Saver() broken in tensorflow 0.12")
    # Reshaping back to the original shape
    logits = tf.reshape(logits, [batch_s, -1, num_classes])

    # Swap dimensions to time major for CTC loss.
    logits = tf.transpose(logits, (1, 0, 2))

    loss = ctc.ctc_loss(targets, logits, seq_len)
    cost = tf.reduce_mean(loss)

    # Record the loss
    tf.contrib.deprecated.scalar_summary('loss', cost)

    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum, use_nesterov=True).minimize(cost)

    decoded, log_prob = ctc.ctc_beam_search_decoder(inputs=logits, sequence_length=seq_len)

    # Label error rate using the edit distance between output and target
    ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                          targets))

    # Record the label error rate
    tf.contrib.deprecated.scalar_summary('label error rate', ler)

    saver = tf.train.Saver()
    merged = tf.contrib.deprecated.merge_all_summaries()
    train_writer = tf.summary.FileWriter('./summaries/train', graph)
    test_writer = tf.summary.FileWriter('./summaries/test', graph)
    
def test_decoding(input_feed_dict, input_original):
    """
예제 #25
0
def runCTC(batch):
    """Run the bidirectional-LSTM CTC network on one batch and return its logits.

    Weights are restored from the checkpoint under '/users/TeamASR/models'
    when one exists; otherwise the variables are freshly initialized.

    Args:
        batch: Array fed to the ``(maxTimeSteps, batchSize, nFeatures)``
            input placeholder, i.e. shaped ``(776, batch_size, 39)``;
            ``batch.shape[1]`` is used as the batch size.

    Returns:
        A one-element list holding the evaluated ``logits3d`` array, shaped
        ``(maxTimeSteps, batchSize, nClasses)``.
    """
    ####Learning Parameters
    learningRate = 0.001
    momentum = 0.9
    batchSize = batch.shape[1]

    ####Network Parameters
    nFeatures = 39  #12 MFCC coefficients + energy, and derivatives
    nHidden = 256
    # NOTE(review): the previous comment said "39 phonemes, plus the blank",
    # which does not match 30 -- presumably the size of the character map
    # plus the CTC blank; confirm against the training script.
    nClasses = 30

    ####Load data
    print('Loading data')
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('TIMIT_data_prepared_for_CTC.pkl', 'rb') as f:
        data = pickle.load(f)
    charmap = data['chars']
    print(charmap)
    charmap.append('_')
    maxTimeSteps = 776

    ####Define graph
    print('Defining graph')
    graph = tf.Graph()
    with graph.as_default():

        ####NOTE: try variable-steps inputs and dynamic bidirectional rnn, when it's implemented in tensorflow

        ####Graph input
        inputX = tf.placeholder(tf.float32,
                                shape=(maxTimeSteps, batchSize, nFeatures))
        #Prep input data to fit requirements of rnn.bidirectional_rnn
        #  Reshape to 2-D tensor (nTimeSteps*batchSize, nfeatures)
        inputXrs = tf.reshape(inputX, [-1, nFeatures])
        #  Split into a list of maxTimeSteps tensors, one per time step
        inputList = tf.split(0, maxTimeSteps, inputXrs)
        targetIxs = tf.placeholder(tf.int64)
        targetVals = tf.placeholder(tf.int32)
        targetShape = tf.placeholder(tf.int64)
        targetY = tf.SparseTensor(targetIxs, targetVals, targetShape)
        seqLengths = tf.placeholder(tf.int32, shape=[batchSize])

        ####Weights & biases
        # The *H2 variables are unused here but are still created, so the
        # auto-generated variable names keep matching existing checkpoints.
        weightsOutH1 = tf.Variable(
            tf.truncated_normal([2, nHidden],
                                stddev=np.sqrt(2.0 / (2 * nHidden))))
        biasesOutH1 = tf.Variable(tf.zeros([nHidden]))
        weightsOutH2 = tf.Variable(
            tf.truncated_normal([2, nHidden],
                                stddev=np.sqrt(2.0 / (2 * nHidden))))
        biasesOutH2 = tf.Variable(tf.zeros([nHidden]))
        weightsClasses = tf.Variable(
            tf.truncated_normal([nHidden, nClasses],
                                stddev=np.sqrt(2.0 / nHidden)))
        biasesClasses = tf.Variable(tf.zeros([nClasses]))

        ####Network
        forwardH1 = rnn_cell.LSTMCell(nHidden,
                                      use_peepholes=True,
                                      state_is_tuple=True)
        backwardH1 = rnn_cell.LSTMCell(nHidden,
                                       use_peepholes=True,
                                       state_is_tuple=True)
        fbH1, _, _ = bidirectional_rnn(forwardH1,
                                       backwardH1,
                                       inputList,
                                       dtype=tf.float32,
                                       scope='BDLSTM_H1')
        # Combine the two directions with a learned per-unit weighting.
        fbH1rs = [tf.reshape(t, [batchSize, 2, nHidden]) for t in fbH1]
        outH1 = [
            tf.reduce_sum(tf.mul(t, weightsOutH1), reduction_indices=1) +
            biasesOutH1 for t in fbH1rs
        ]

        logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]

        ####Optimizing
        # The training ops are not run here; they are kept so the graph's
        # variable set (including optimizer slots) matches the checkpoint.
        logits3d = tf.pack(logits)
        loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
        optimizer = tf.train.MomentumOptimizer(learningRate,
                                               momentum).minimize(loss)

        ####Evaluating
        logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0],
                                 [seqLengths[0], 1])
        predictions = tf.to_int32(
            ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
        errorRate = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False)) / \
                    tf.to_float(tf.size(targetY.values))

    ####Run session
    with tf.Session(graph=graph) as session:
        print('Initializing')
        saver = tf.train.Saver()

        ckpt = tf.train.get_checkpoint_state('/users/TeamASR/models')
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" %
                  ckpt.model_checkpoint_path)
            saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            session.run(tf.initialize_all_variables())
        # Every sequence is declared to span the full padded length, tied to
        # maxTimeSteps instead of repeating the literal.
        feedDict = {
            inputX: batch,
            seqLengths: (np.ones([batchSize]) * maxTimeSteps)
        }
        logit = session.run([logits3d], feed_dict=feedDict)
    return logit
예제 #26
0
# Output layer, CTC loss and evaluation ops.
b2 = bias_variable([output_size])
# Project every (batch, time) row of the LSTM output to class scores.
l_reshape3 = tf.reshape(lstm_output_tr, [-1, 2 * hidden_size])
h_2 = tf.matmul(l_reshape3, W2) + b2

l_reshape4 = tf.reshape(h_2, [-1, output_size])

# Class probabilities, kept under their original names for inspection.
l_soft = tf.nn.softmax(l_reshape4)
l_soft_reshaped = tf.reshape(l_soft, [-1, n_time_steps, output_size])
l_soft_tr = tf.transpose(l_soft_reshaped, [1, 0, 2])

# ctc_loss and the beam-search decoder apply softmax themselves, so they get
# the unnormalized scores; feeding l_soft would normalize twice. Both also
# receive the same time-major layout that the loss already used.
logits_reshaped = tf.reshape(l_reshape4, [-1, n_time_steps, output_size])
logits_tr = tf.transpose(logits_reshaped, [1, 0, 2])

loss = tf.reduce_mean(tf.nn.ctc_loss(logits_tr, targets, seqLengths))
optimizer = tf.train.AdamOptimizer(learningRate).minimize(loss)
# Arg-max class per time step of the first batch item, limited to its length
# (axis 0 must be time for the seqLengths[0] slice to make sense).
logitsMaxTest = tf.slice(tf.argmax(logits_tr, 2), [0, 0],
                         [seqLengths[0], 1])
predictions = tf.to_int32(
    ctc.ctc_beam_search_decoder(logits_tr, seqLengths)[0][0])
# Summed unnormalized edit distance divided by the number of target labels.
errorRate = tf.reduce_sum(tf.edit_distance(predictions, targets, normalize=False)) / \
                tf.to_float(tf.size(targets.values))


def getminibatch(x, y, bs, seq_len=776):
    """Draw a random minibatch (without replacement) from ``x`` / ``y``.

    Args:
        x: indexable collection of inputs; ``len(x)`` fixes the pool size.
        y: indexable collection of targets, indexed with the same positions
            as ``x``.
        bs: requested batch size. When ``bs`` exceeds ``len(x)`` only
            ``len(x)`` examples are returned.
        seq_len: sequence length reported for every selected example.
            Defaults to 776, the value that used to be hard-coded.

    Returns:
        dict with keys
        ``'x'``      - ``np.array`` of the selected inputs,
        ``'ind'``, ``'val'``, ``'shape'`` - the three values returned by
        ``target_list_to_sparse_tensor`` for the selected targets,
        ``'seqlen'`` - float array holding ``seq_len`` once per selected
        example.
    """
    chosen = np.random.permutation(len(x))[:bs]

    batch = {'x': np.array([x[i] for i in chosen])}
    batch['ind'], batch['val'], batch['shape'] = target_list_to_sparse_tensor(
        [y[i] for i in chosen])
    # Size by the number of rows actually drawn so 'seqlen' always lines up
    # with 'x' (the two disagreed when bs > len(x)). float64 matches the
    # dtype the previous np.zeros(...).fill(...) produced.
    batch['seqlen'] = np.full([len(chosen)], seq_len, dtype=np.float64)
    return batch
        biasesOutH1 for t in fbH1rs
    ]

    # Class scores: apply the shared output projection (weightsClasses,
    # biasesClasses) to every tensor in outH1 -- presumably one tensor per
    # time step; confirm against the part of the graph that builds outH1.
    logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]

    ####Optimizing
    # Stack the list into a single tensor whose leading axis indexes the list
    # entries, then minimise the mean CTC loss with momentum SGD.
    logits3d = tf.pack(logits)
    loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
    optimizer = tf.train.MomentumOptimizer(learningRate,
                                           momentum).minimize(loss)

    ####Evaluating
    # Arg-max class over the last axis, restricted to the first seqLengths[0]
    # rows and the first column.
    logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0],
                             [seqLengths[0], 1])
    # [0][0]: first element of the decoder's return tuple, then its first
    # entry, cast to int32 so it can be compared with targetY.
    predictions = tf.to_int32(
        ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
    # Summed (un-normalised) edit distance divided by the number of target
    # labels in the batch.
    errorRate = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False)) / \
                tf.to_float(tf.size(targetY.values))

####Run session
# Training driver: initialise every variable, then visit the pre-built
# batches in a fresh random order each epoch. (The rest of the loop body is
# not part of this fragment.)
with tf.Session(graph=graph) as session:
    print('Initializing')
    tf.initialize_all_variables().run()
    for epoch in range(nEpochs):
        print('Epoch', epoch + 1, '...')
        # One slot per batch; filled in by code beyond this fragment.
        batchErrors = np.zeros(len(batchedData))
        batchRandIxs = np.random.permutation(
            len(batchedData))  #randomize batch order
        # `batch` counts batches within this epoch; `batchOrigI` is the index
        # of the batch inside batchedData.
        for batch, batchOrigI in enumerate(batchRandIxs):
            # Each batchedData entry unpacks into (inputs, sparse targets,
            # sequence lengths).
            batchInputs, batchTargetSparse, batchSeqLengths = batchedData[
                batchOrigI]
예제 #28
0
    def ctc_loss(self,
                 outputs,
                 targets,
                 seq_len,
                 num_classes,
                 initial_learning_rate,
                 keep_prob=0.8,
                 scopeN="l1-ctc_loss"):
        """Build the CTC projection, loss, optimizer and decoding ops.

        @param outputs: tensor that is multiplied by a
            [self.hidden * 2, num_classes] weight matrix; the product is then
            reshaped to [self.width, self.batch_size, num_classes]. (The
            previous docstring described it as [batch, h, w, channels] --
            TODO confirm the real layout against the caller.)
        @param targets: sparse tensor holding the target label sequences
        @param seq_len: the length of the input sequences [batch]
        @param num_classes: the number of classes
        @param initial_learning_rate: learning rate handed to the optimizer
        @param keep_prob: keep probability passed to tf.nn.dropout on the
            logits; None skips dropout. Note that dropout is applied whenever
            a value is given -- there is no train/eval switch here.
        @param scopeN: suffix for the variable scope name

        @returns: tuple (optimizer, cost, ler, decoded) -- the minimize op,
            the mean CTC loss, the mean edit distance between decoded[0] and
            targets (label error rate), and the decoder output.
        @raises ValueError: if self.optimizer is not "ADAM"/"RMSP" or
            self.ctc_decoder is not "greedy"/"beam_search".
        """
        # Fail fast: reject bad configuration before any variable, summary or
        # op is added to the graph. ValueError is still caught by callers
        # that handle the generic Exception raised previously.
        if self.optimizer not in ("ADAM", "RMSP"):
            raise ValueError("model type not supported: {}".format(
                self.optimizer))
        if self.ctc_decoder not in ("greedy", "beam_search"):
            raise ValueError("model type not supported: {}".format(
                self.ctc_decoder))

        with tf.name_scope('Train'):
            with tf.variable_scope("ctc_loss-" + scopeN):
                W = tf.Variable(
                    tf.truncated_normal([self.hidden * 2, num_classes],
                                        stddev=0.1))
                # Zero initialization
                b = tf.Variable(tf.constant(0., shape=[num_classes]))

            tf.summary.histogram('histogram-b-ctc', b)
            tf.summary.histogram('histogram-w-ctc', W)

            # Doing the affine projection
            logits = tf.matmul(outputs, W) + b

            if keep_prob is not None:
                logits = tf.nn.dropout(logits, keep_prob)

            # Reshape to [width, batch, classes]. No transpose is performed,
            # so the rows of `outputs` are presumably already ordered
            # time-major -- TODO confirm with the caller.
            logits = tf.reshape(logits,
                                [self.width, self.batch_size, num_classes])

            with tf.name_scope('CTC-loss'):
                loss = ctc_ops.ctc_loss(logits, targets, seq_len)
                cost = tf.reduce_mean(loss)

            with tf.name_scope('Optimizer'):
                if self.optimizer == "ADAM":
                    optimizer = tf.train.AdamOptimizer(
                        learning_rate=initial_learning_rate,
                        name="AdamOptimizer").minimize(cost)
                else:  # "RMSP" -- validated at the top of the method
                    optimizer = tf.train.RMSPropOptimizer(
                        learning_rate=initial_learning_rate,
                        decay=self.decay,
                        momentum=self.momentum).minimize(cost)

            with tf.name_scope('Prediction'):
                if self.ctc_decoder == 'greedy':
                    decoded, _ = ctc_ops.ctc_greedy_decoder(logits, seq_len)
                else:  # 'beam_search' -- validated at the top of the method
                    decoded, _ = ctc_ops.ctc_beam_search_decoder(
                        logits, seq_len)

                # Inaccuracy: label error rate
                ler = tf.reduce_mean(
                    tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))
        return optimizer, cost, ler, decoded
예제 #29
0
                logits = tf.transpose(logits, [1, 0, 2])

                return logits


# Script body: build the input pipeline, the CTC loss / decoder ops and the
# Adam update, then start a session. (The epoch loop body is not part of
# this fragment.)
datasets = read_data_sets("./data/ldc93s1", train_batch_size, dev_batch_size,
                          test_batch_size, n_input, n_context)

# One training batch from the dataset object; since queue runners are started
# below these are presumably graph tensors rather than arrays -- TODO confirm.
audio, audio_lengths, labels = datasets.train.next_batch()

# `model` returns logits that feed both the CTC loss and the decoder.
logits = model(audio, audio_lengths, dropout_rate)
loss = ctc_ops.ctc_loss(logits, labels, audio_lengths)
avg_loss = tf.reduce_mean(loss)

# Only the decoded output is kept; the decoder's second return value is
# discarded.
decoded, _ = ctc_ops.ctc_beam_search_decoder(logits,
                                             audio_lengths,
                                             merge_repeated=False)

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                   beta1=beta1,
                                   beta2=beta2,
                                   epsilon=epsilon)
optimize_op = optimizer.minimize(avg_loss)

session = tf.Session(config=session_config)
with session.as_default():
    # Initialise variables, then start the TensorFlow queue runners and the
    # dataset's own feeding threads before the training loop begins.
    tf.initialize_all_variables().run()
    tf.train.start_queue_runners()
    datasets.start_queue_threads(session)

    for epoch in range(training_iters):
예제 #30
0
    """
    # Flatten the two entries of `outputs` (named forward / backward here) to
    # [-1, num_hidden] so they can be projected with a plain matmul.
    fw_output = tf.reshape(outputs[0], [-1, num_hidden])
    bw_output = tf.reshape(outputs[1], [-1, num_hidden])
    # Both directions are projected with the SAME weight_classes matrix and
    # summed before the bias is added.
    logits = tf.add(
        tf.add(tf.matmul(fw_output, weight_classes),
               tf.matmul(bw_output, weight_classes)), bias_classes)

    # Restore the batch-leading layout [batch_size, -1, num_classes]; this is
    # why ctc_loss is called with time_major=False.
    logits = tf.reshape(logits, [batch_size, -1, num_classes])
    loss = tf.reduce_mean(
        ctc_ops.ctc_loss(logits, targets, seq_len, time_major=False))
    optimizer = tf.train.MomentumOptimizer(learning_rate,
                                           momentum).minimize(loss)

    # Evaluating
    # The decoder is given the logits with their first two axes swapped.
    # decoded, log_prob = ctc_ops.ctc_greedy_decoder(tf.transpose(logits, perm=[1, 0, 2]), seq_len)
    decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
        tf.transpose(logits, perm=[1, 0, 2]), seq_len)
    # Mean edit distance between the first decoded path and the targets.
    label_error_rate = tf.reduce_mean(
        tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

# Session options: per_process_gpu_memory_fraction=0.5 is passed to the
# session config below.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)

with tf.Session(graph=graph,
                config=tf.ConfigProto(gpu_options=gpu_options)) as session:
    # Load the model
    # NOTE(review): get_checkpoint_state may return None when model_folder
    # holds no checkpoint, in which case the print below would raise
    # AttributeError -- confirm and guard if needed.
    ckpt = tf.train.get_checkpoint_state(model_folder)
    print('load', ckpt.model_checkpoint_path)
    saver = tf.train.Saver()
    saver.restore(session, ckpt.model_checkpoint_path)
    # Prepare the input files
    filenames = glob(P.TEST_DATA)