def call(self, audio_data, sample_rate=None): """ Caculate fbank && pitch(concat) features of wav. :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor. :param sample_rate: the samplerate of the signal we working with. :return: A tensor with shape (num_frames, dim_features), containing fbank && pitch feature of every frame in speech. """ p = self.config with tf.name_scope('fbank_pitch'): if sample_rate == None: sample_rate = tf.constant(p.sample_rate, dtype=tf.int32) assert_op = tf.assert_equal(tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32)) with tf.control_dependencies([assert_op]): fbank_feats = tf.squeeze(self.fbank(audio_data, sample_rate)) pitch_feats = tf.squeeze(self.pitch(audio_data, sample_rate)) fbank_pitch_feats = tf.concat([fbank_feats, pitch_feats], 1) return fbank_pitch_feats
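# A minimal usage sketch for the extractor above. The class name `FbankPitch`
# and the `params(...).instantiate()` construction mirror the compute_*
# scripts later in this file; both are assumptions here, as is the fake
# one-second waveform.
import tensorflow as tf

fbank_pitch = FbankPitch.params({'sample_rate': 16000}).instantiate()  # assumed API
audio_data = tf.random.uniform([1, 16000], minval=-1.0, maxval=1.0)  # fake audio
feats = fbank_pitch(audio_data, 16000)  # (num_frames, dim_features)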
def compute_mfcc():
  parser = get_parser()
  args = parser.parse_args()

  config = {}
  config['sample_rate'] = int(args.sample_rate)
  config['upper_frequency_limit'] = float(args.upper_frequency_limit)
  config['lower_frequency_limit'] = float(args.lower_frequency_limit)
  config['filterbank_channel_count'] = float(args.filterbank_channel_count)
  config['window_length'] = args.window_length
  config['frame_length'] = args.frame_length
  config['output_type'] = args.output_type
  config['window_type'] = args.window_type
  config['snip_edges'] = args.snip_edges
  config['preeph_coeff'] = args.preeph_coeff
  config['remove_dc_offset'] = args.remove_dc_offset
  config['is_fbank'] = args.is_fbank
  config['cepstral_lifter'] = args.cepstral_lifter
  config['coefficient_count'] = args.coefficient_count

  mfcc = Mfcc.params(config).instantiate()

  # Create the session once, outside the per-utterance loop.
  sess = tf.Session()
  with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
      KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                  compress=args.compress,
                  compression_method=args.compression_method) as writer:
    for utt_id, (sample_rate, array) in reader:
      if sample_rate != args.sample_rate:
        args.sample_rate = sample_rate
      array = array.astype(np.float32)
      audio_data = tf.constant(array, dtype=tf.float32)
      mfcc_test = tf.squeeze(mfcc(audio_data, args.sample_rate))
      mfcc_feats = mfcc_test.eval(session=sess)
      writer[utt_id] = mfcc_feats
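# Hypothetical invocation of the script above, using Kaldi-style read/write
# specifiers (the script name and flag values are illustrative, not taken
# from the source):
#
#   python compute_mfcc.py --sample_rate 16000 \
#       'scp:data/train/wav.scp' 'ark,scp:mfcc.ark,mfcc.scp'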
def call(self, audio_data, sample_rate=None):
  """
  Add RIR and/or noise to the audio data.
  :param audio_data: the audio signal to augment. Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; defaults to 16kHz.
  :return: A float tensor of size N containing the noise-added audio.
  """
  p = self.config
  with tf.name_scope('add_rir_noise_aecres'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      add_rir_noise_aecres_out = py_x_ops.add_rir_noise_aecres(
          audio_data,
          sample_rate,
          if_add_rir=p.if_add_rir,
          rir_filelist=p.rir_filelist,
          if_add_noise=p.if_add_noise,
          snr_min=p.snr_min,
          snr_max=p.snr_max,
          noise_filelist=p.noise_filelist,
          if_add_aecres=p.if_add_aecres,
          aecres_filelist=p.aecres_filelist)

      return tf.squeeze(add_rir_noise_aecres_out)
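# A minimal configuration sketch for the augmentation op above. The class
# name `Add_rir_noise_aecres` and the `params(...).instantiate()` pattern are
# assumptions borrowed from the compute_* scripts; the file-list paths are
# hypothetical.
import tensorflow as tf

config = {
    'if_add_rir': True,
    'rir_filelist': 'rir.list',  # hypothetical path
    'if_add_noise': True,
    'snr_min': 0,
    'snr_max': 30,
    'noise_filelist': 'noise.list',  # hypothetical path
    'if_add_aecres': False,
    'aecres_filelist': 'aecres.list',  # hypothetical path
}
aug = Add_rir_noise_aecres.params(config).instantiate()  # assumed API
audio_data = tf.random.uniform([1, 16000], minval=-1.0, maxval=1.0)  # fake audio
noisy = aug(audio_data, 16000)  # same length, with noise/RIR applied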
def compute_fbank():
  parser = get_parser()
  args = parser.parse_args()

  config = {}
  config['sample_rate'] = int(args.sample_rate)
  config['upper_frequency_limit'] = float(args.upper_frequency_limit)
  config['lower_frequency_limit'] = float(args.lower_frequency_limit)
  config['filterbank_channel_count'] = float(args.filterbank_channel_count)
  config['window_length'] = args.window_length
  config['frame_length'] = args.frame_length
  config['output_type'] = args.output_type

  fbank = Fbank.params(config).instantiate()

  # Create the session once, outside the per-utterance loop.
  sess = tf.compat.v1.Session()
  with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
      KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                  compress=args.compress,
                  compression_method=args.compression_method) as writer:
    for utt_id, (sample_rate, array) in reader:
      if sample_rate != args.sample_rate:
        args.sample_rate = sample_rate
      array = array.astype(np.float32)
      audio_data = tf.constant(array, dtype=tf.float32)
      fbank_test = tf.squeeze(fbank(audio_data, args.sample_rate))
      fbank_feats = fbank_test.eval(session=sess)
      writer[utt_id] = fbank_feats
def split_one_doc_to_true_len_sens(doc_t, split_token, padding_token,
                                   max_doc_len, max_sen_len):
  """
  Split a document into sentences with true sentence lengths.
  doc_t: [doc_word_len]
  out_t: [max_doc_len, max_sen_len]
  """
  if len(doc_t.get_shape()) == 1:
    split_token_index = tf.squeeze(
        tf.where(tf.equal(doc_t, split_token)), axis=1)
    split_token_index.set_shape([None])
    split_len_part_1 = split_token_index[:1] + 1
    split_len_part_2 = split_token_index[1:] - split_token_index[:-1]
    split_lens = tf.concat([split_len_part_1, split_len_part_2], axis=0)
    split_lens = cut_or_padding(
        split_lens, max_doc_len, padding_token=padding_token)
    new_doc_len = tf.reduce_sum(split_lens)
    splited_sentences = tf.split(doc_t[:new_doc_len], split_lens)
    splited_sentences = [
        cut_or_padding(s, max_sen_len) for s in splited_sentences
    ]
    out_t = tf.stack(splited_sentences)
    padding_tokens = tf.multiply(
        tf.ones_like(out_t, dtype=tf.int32), padding_token)
    out_t = tf.where(tf.equal(out_t, split_token), padding_tokens, out_t)
    return out_t

  raise ValueError("doc_t should be a tensor with rank 1.")
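# A worked trace of the splitter above (tokens are illustrative; SEP denotes
# split_token and PAD denotes padding_token):
#
#   doc_t              = [w1, w2, SEP, w3, SEP]
#   split_token_index  = [2, 4]
#   split_lens         = [3, 2]                      # [2 + 1, 4 - 2]
#   splited_sentences  = [[w1, w2, SEP], [w3, SEP]]  # each cut/padded to max_sen_len
#   out_t              = SEP positions replaced by PAD, stacked to
#                        [max_doc_len, max_sen_len]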
def call(self, audio_data, sample_rate=None):
  """
  Calculate the power of every frame in the speech.
  :param audio_data: the audio signal from which to compute the spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; defaults to 16kHz.
  :return: A float tensor of size (num_frames,) containing the power of every
           frame in the speech.
  """
  p = self.config
  with tf.name_scope('framepow'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      sample_rate = tf.cast(sample_rate, dtype=float)
      framepow = py_x_ops.frame_pow(
          audio_data,
          sample_rate,
          snip_edges=p.snip_edges,
          remove_dc_offset=p.remove_dc_offset,
          window_length=p.window_length,
          frame_length=p.frame_length)

      return tf.squeeze(framepow)
def call(self, audio_data, sample_rate=None):
  """
  Calculate pitch features of the audio data.
  :param audio_data: the audio signal from which to compute the spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; defaults to 16kHz.
  :return: A float tensor of size (num_frames, 1) containing pitch features of
           every frame in the speech.
  """
  p = self.config
  with tf.name_scope('pitch'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=float)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
    with tf.control_dependencies([assert_op]):
      pitch = py_x_ops.pitch(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length,
          thres_autoc=p.thres_autoc)

      pitch = tf.squeeze(pitch)
      pitch = tf.transpose(pitch[None, :])
      return pitch
def attention(inputs, attention_size, time_major=False, return_alphas=False):
  """Attention layer."""
  if isinstance(inputs, tuple):
    # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
    inputs = tf.concat(inputs, 2)

  if time_major:
    # (T,B,D) => (B,T,D)
    inputs = tf.transpose(inputs, [1, 0, 2])

  time_size = inputs.shape[1].value  # T value - time size of the RNN layer
  hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

  # Trainable parameters
  W_omega = tf.get_variable(
      name='W_omega',
      initializer=tf.random_normal([hidden_size, attention_size], stddev=0.1))
  b_omega = tf.get_variable(
      name='b_omega',
      initializer=tf.random_normal([attention_size], stddev=0.1))
  u_omega = tf.get_variable(
      name='u_omega',
      initializer=tf.random_normal([attention_size, 1], stddev=0.1))

  # Applying a fully connected layer with non-linear activation to each of the
  # B*T timestamps; the shape of `v` is (B,T,D)*(D,A)=(B,T,A),
  # where A=attention_size.
  #v = tf.tanh(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
  #v = tf.sigmoid(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
  # (B, T, D) dot (D, Atten)
  logging.info('attention inputs: {}'.format(inputs.shape))
  inputs_reshaped = tf.reshape(inputs, [-1, hidden_size])
  dot = tf.matmul(inputs_reshaped, W_omega)
  dot = tf.reshape(dot, [-1, time_size, attention_size])
  v = tf.sigmoid(dot + b_omega)
  logging.info(f'attention vector: {v.shape}')

  # For each of the timestamps, its vector of size A from `v` is reduced with
  # the `u` vector: (B, T, Atten) dot (Atten) -> (B, T).
  #vu = tf.tensordot(v, u_omega, axes=1)   # (B,T) shape
  v = tf.reshape(v, [-1, attention_size])
  vu = tf.matmul(v, u_omega)  # (B,T) shape
  vu = tf.squeeze(vu, axis=-1)
  vu = tf.reshape(vu, [-1, time_size])
  logging.info(f'attention energy: {vu.shape}')

  alphas = tf.nn.softmax(vu)  # (B,T) shape also

  # Output of (Bi-)RNN is reduced with the attention vector;
  # the result has (B,D) shape.
  # [batch, time] -> [batch, time, 1]
  alphas = tf.expand_dims(alphas, -1)
  # [batch, time, dim] -> [batch, dim]
  output = tf.reduce_sum(inputs * alphas, 1)

  if not return_alphas:
    return output

  return output, alphas
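# A minimal usage sketch for the attention layer above (TF1-style graph mode,
# to match tf.get_variable; batch/time/hidden sizes are illustrative
# assumptions).
import tensorflow as tf

rnn_outputs = tf.random.uniform([32, 50, 256])  # (B=32, T=50, D=256) fake RNN output
context, alphas = attention(rnn_outputs, attention_size=128, return_alphas=True)
# context: (32, 256); alphas: (32, 50, 1)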
def read_wav(wavfile, params):
  ''' samples of shape [nsample] '''
  contents = tf.read_file(wavfile)  #pylint: disable=no-member
  waveforms = tf.audio.decode_wav(
      contents,
      desired_channels=params.audio_channels,
      #desired_samples=params.audio_sample_rate,
  )
  return tf.squeeze(waveforms.audio, axis=-1)
def ctc_lambda_loss(logits, labels, input_length, label_length, blank_index=0):
  ''' CTC loss function.
  param: logits, (B, T, D)
  param: input_length, (B, 1), input length of encoder
  param: labels, (B, T)
  param: label_length, (B, 1), label length for converting dense labels to sparse
  returns: batch_loss, shape (B,)
  '''
  ilen = tf.cond(
      pred=tf.equal(tf.rank(input_length), 1),
      true_fn=lambda: input_length,
      false_fn=lambda: tf.squeeze(input_length),
  )
  ilen = tf.cast(ilen, tf.int32)

  olen = tf.cond(
      pred=tf.equal(tf.rank(label_length), 1),
      true_fn=lambda: label_length,
      false_fn=lambda: tf.squeeze(label_length))
  olen = tf.cast(olen, tf.int32)

  deps = [
      tf.assert_rank(labels, 2, name='label_rank_check'),
      tf.assert_rank(logits, 3, name='logits_rank_check'),
      tf.assert_rank(ilen, 1, name='src_len_rank_check'),  # input_length
      tf.assert_rank(olen, 1, name='tgt_len_rank_check'),  # output_length
  ]

  labels, logits = ctc_data_transform(labels, logits, blank_index)

  with tf.control_dependencies(deps):
    # (B,)
    # blank index is consistent with Espnet, zero
    batch_loss = tf.nn.ctc_loss(
        labels=labels,
        inputs=logits,
        sequence_length=ilen,
        time_major=False,
        preprocess_collapse_repeated=False,
        ctc_merge_repeated=True,
        ignore_longer_outputs_than_inputs=False)
    return batch_loss
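# A minimal usage sketch for the CTC loss wrapper above (TF1 graph mode;
# shapes, vocab size, and the final reduce_mean are illustrative assumptions;
# `ctc_data_transform` must be available, as in the function itself).
import tensorflow as tf

logits = tf.random.uniform([4, 100, 30])  # (B, T, D); D includes the blank
labels = tf.random.uniform([4, 20], maxval=29, dtype=tf.int32)  # (B, T_label)
input_length = tf.fill([4, 1], 100)  # encoder output lengths
label_length = tf.fill([4, 1], 20)  # label lengths
loss = tf.reduce_mean(
    ctc_lambda_loss(logits, labels, input_length, label_length, blank_index=0))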
def load_textline_dataset(paths, column_num):
  """Load raw data for text task."""
  ds = tf.data.TextLineDataset(paths)
  ds = ds.map(
      lambda x: tf.squeeze(
          tf.strings.split(x, sep="\t", result_type="RaggedTensor"), axis=0))
  ds = ds.filter(lambda line: tf.equal(tf.size(line), column_num))
  ds_list = []
  for i in range(column_num):
    # Bind `i` as a default argument so each column map keeps its own index.
    ds_list.append(ds.map(lambda x, i=i: x[i]))

  return tuple(ds_list)
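# A minimal usage sketch for the loader above. The file name and column count
# are illustrative assumptions; each line of the file is expected to hold
# `column_num` tab-separated fields.
import tensorflow as tf

label_ds, text_ds = load_textline_dataset(['train.tsv'], column_num=2)  # hypothetical file
pair_ds = tf.data.Dataset.zip((label_ds, text_ds))  # iterate pairs downstream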
def call(self, wavfile):
  """
  Get audio data and sample rate from a wav file.
  :param wavfile: filepath of the wav
  :return: 2 values. The first is a tensor of audio data.
           The second is the sample rate of the input wav file,
           which is a tensor with float dtype.
  """
  p = self.config
  contents = tf.io.read_file(wavfile)
  audio_data, sample_rate = tf.audio.decode_wav(
      contents, desired_channels=p.audio_channels)
  assert_op = tf.assert_equal(
      tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
  with tf.control_dependencies([assert_op]):
    return tf.squeeze(audio_data, axis=-1), tf.cast(sample_rate, dtype=float)
def call(self, tensors):
  """Attention layer."""
  left, right = tensors

  len_left = left.shape[1]
  len_right = right.shape[1]
  tensor_left = tf.expand_dims(left, axis=2)
  tensor_right = tf.expand_dims(right, axis=1)
  tensor_left = tf.tile(tensor_left, [1, 1, len_right, 1])
  tensor_right = tf.tile(tensor_right, [1, len_left, 1, 1])
  tensor_merged = tf.concat([tensor_left, tensor_right], axis=-1)
  middle_output = self.middle_layer(tensor_merged)
  attn_scores = self.attn(middle_output)
  attn_scores = tf.squeeze(attn_scores, axis=3)

  # Numerically stable softmax over the last axis.
  exp_attn_scores = tf.exp(
      attn_scores - tf.reduce_max(attn_scores, axis=-1, keepdims=True))
  exp_sum = tf.reduce_sum(exp_attn_scores, axis=-1, keepdims=True)
  attention_weights = exp_attn_scores / exp_sum

  return tf.matmul(attention_weights, right)
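# A shape walk-through for the match-attention layer above; B (batch),
# len_left, len_right, and D (hidden dim) are illustrative:
#
#   left:   (B, len_left, D)  -> expand + tile -> (B, len_left, len_right, D)
#   right:  (B, len_right, D) -> expand + tile -> (B, len_left, len_right, D)
#   merged:                      (B, len_left, len_right, 2D)
#   attn_scores (after squeeze): (B, len_left, len_right)
#   softmax over len_right, then weights @ right: (B, len_left, D)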
def compute_pitch():
  parser = get_parser()
  args = parser.parse_args()

  config = {}
  config['sample_rate'] = int(args.sample_rate)
  config['window_length'] = args.window_length
  config['frame_length'] = args.frame_length
  config['snip_edges'] = args.snip_edges
  config['preemph_coeff'] = args.preemph_coeff
  config['min_f0'] = args.min_f0
  config['max_f0'] = args.max_f0
  config['soft_min_f0'] = args.soft_min_f0
  config['penalty_factor'] = args.penalty_factor
  config['lowpass_cutoff'] = args.lowpass_cutoff
  config['resample_freq'] = args.resample_freq
  config['delta_pitch'] = args.delta_pitch
  config['nccf_ballast'] = args.nccf_ballast
  config['lowpass_filter_width'] = args.lowpass_filter_width
  config['upsample_filter_width'] = args.upsample_filter_width
  config['max_frames_latency'] = args.max_frames_latency
  config['frames_per_chunk'] = args.frames_per_chunk
  config['simulate_first_pass_online'] = args.simulate_first_pass_online
  config['recompute_frame'] = args.recompute_frame
  config['nccf_ballast_online'] = args.nccf_ballast_online

  pitch = Pitch.params(config).instantiate()

  # Create the session once, outside the per-utterance loop.
  sess = tf.Session()
  with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
      KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                  compress=args.compress,
                  compression_method=args.compression_method) as writer:
    for utt_id, (sample_rate, array) in reader:
      if sample_rate != args.sample_rate:
        args.sample_rate = sample_rate
      array = array.astype(np.float32)
      audio_data = tf.constant(array, dtype=tf.float32)
      pitch_test = tf.squeeze(pitch(audio_data, args.sample_rate))
      pitch_feats = pitch_test.eval(session=sess)
      writer[utt_id] = pitch_feats